validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +127 -69
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +70 -31
- validmind/client.py +5 -5
- validmind/logging.py +38 -32
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -49
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +27 -34
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +12 -6
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AnswerRelevance.py

@@ -11,7 +11,7 @@ from ragas.metrics import answer_relevancy
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "rag_performance")
@@ -108,8 +108,7 @@ def AnswerRelevance(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[answer_relevancy],
+        Dataset.from_pandas(df), metrics=[answer_relevancy], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
@@ -117,7 +116,9 @@ def AnswerRelevance(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["question", "contexts", "answer", "answer_relevancy"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_relevancy"].mean(),
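All of the ragas tests in this release gain the same change: the metric evaluation is routed through the LLM and embeddings returned by `get_ragas_config()` instead of ragas' implicit defaults. A sketch of what the call expands to once the config dict is splatted in; the model names, the tiny DataFrame, and the API-key requirement are illustrative assumptions, not package code:

```python
# Sketch (not from the package): the evaluate() call after **get_ragas_config()
# expansion. Model names and data are made up; a valid OPENAI_API_KEY is needed
# to actually run it.
import pandas as pd
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.metrics import answer_relevancy

df = pd.DataFrame(
    {
        "question": ["What does MAPE measure?"],
        "contexts": [["MAPE is the mean absolute percentage error."]],
        "answer": ["It measures the average percentage error of predictions."],
    }
)

result_df = evaluate(
    Dataset.from_pandas(df),
    metrics=[answer_relevancy],
    llm=ChatOpenAI(model="gpt-4o-mini"),                          # the "llm" entry
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),  # the "embeddings" entry
).to_pandas()

print(result_df["answer_relevancy"].mean())
```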
validmind/tests/model_validation/ragas/AnswerSimilarity.py

@@ -11,7 +11,7 @@ from ragas.metrics import answer_similarity
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm")
@@ -93,8 +93,7 @@ def AnswerSimilarity(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[answer_similarity],
+        Dataset.from_pandas(df), metrics=[answer_similarity], **get_ragas_config()
    ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def AnswerSimilarity(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["answer", "ground_truth", "answer_similarity"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_similarity"].mean(),
validmind/tests/model_validation/ragas/AspectCritique.py

@@ -18,7 +18,7 @@ from ragas.metrics.critique import (
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 aspect_map = {
     "coherence": coherence,
@@ -36,14 +36,14 @@ def AspectCritique(
     question_column="question",
     answer_column="answer",
     contexts_column="contexts",
-    aspects: list = [
+    aspects: list = [  # noqa: B006 this is fine as immutable default since it never gets modified
         "coherence",
         "conciseness",
         "correctness",
         "harmfulness",
         "maliciousness",
     ],
-    additional_aspects: list =
+    additional_aspects: list = None,
 ):
     """
     Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -131,13 +131,19 @@ def AspectCritique(
     df = get_renamed_columns(dataset.df, required_columns)
 
     built_in_aspects = [aspect_map[aspect] for aspect in aspects]
-    custom_aspects =
-
-
-
+    custom_aspects = (
+        [
+            _AspectCritique(name=name, definition=description)
+            for name, description in additional_aspects
+        ]
+        if additional_aspects
+        else []
+    )
     all_aspects = [*built_in_aspects, *custom_aspects]
 
-    result_df = evaluate(
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
+    ).to_pandas()
 
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
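Two small signature changes here: the `aspects` list default is kept but marked `# noqa: B006` because it is only read, never mutated, and `additional_aspects` now defaults to `None`, with custom `_AspectCritique` objects built only when the caller supplies `(name, definition)` pairs. A minimal sketch of the same guard pattern with made-up aspect names (not the package's API):

```python
# Generic illustration of the None-guard pattern used above: a read-only
# sequence default is tolerated, and the optional argument is expanded
# only when the caller actually passes (name, description) pairs.
def build_aspects(aspects=("coherence", "conciseness"), additional_aspects=None):
    built_in = [f"built-in:{name}" for name in aspects]
    custom = (
        [f"custom:{name} ({description})" for name, description in additional_aspects]
        if additional_aspects
        else []
    )
    return [*built_in, *custom]


print(build_aspects())
print(build_aspects(additional_aspects=[("clarity", "Is the answer clear?")]))
```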
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -11,7 +11,7 @@ from ragas.metrics import context_entity_recall
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -99,8 +99,7 @@ def ContextEntityRecall(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_entity_recall],
+        Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(
@@ -110,7 +109,7 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 [
                     "contexts",
                     "ground_truth",
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -11,7 +11,7 @@ from ragas.metrics import context_precision
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -21,7 +21,7 @@ def ContextPrecision(
     question_column: str = "question",
     contexts_column: str = "contexts",
     ground_truth_column: str = "ground_truth",
-):
+):  # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
     relevant items present in the contexts are ranked higher or not. Ideally all the
@@ -95,8 +95,7 @@ def ContextPrecision(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_precision],
+        Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_precision"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextPrecision(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 ["question", "contexts", "ground_truth", "context_precision"]
             ],
             "Aggregate Scores": [
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -11,7 +11,7 @@ from ragas.metrics import context_recall
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -95,8 +95,7 @@ def ContextRecall(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_recall],
+        Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_recall"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextRecall(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 ["question", "contexts", "ground_truth", "context_recall"]
             ],
             "Aggregate Scores": [
validmind/tests/model_validation/ragas/ContextRelevancy.py

@@ -11,7 +11,7 @@ from ragas.metrics import context_relevancy
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -88,8 +88,7 @@ def ContextRelevancy(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_relevancy],
+        Dataset.from_pandas(df), metrics=[context_relevancy], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_relevancy"].to_list(), nbins=10)
@@ -97,7 +96,9 @@ def ContextRelevancy(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["question", "contexts", "context_relevancy"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_relevancy"].mean(),
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -11,7 +11,7 @@ from ragas.metrics import faithfulness
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "rag_performance")
@@ -20,7 +20,7 @@ def Faithfulness(
     dataset,
     answer_column="answer",
     contexts_column="contexts",
-):
+):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
 
@@ -93,8 +93,7 @@ def Faithfulness(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[faithfulness],
+        Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["faithfulness"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["contexts", "answer", "faithfulness"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),
validmind/tests/model_validation/ragas/utils.py

@@ -2,17 +2,42 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import os
 
-
-    if not isinstance(x, dict):
-        raise TypeError(f"Expected a dictionary in column '{root_col}', got {type(x)}.")
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
-
-        raise KeyError(
-            f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
-        )
+from validmind.ai.utils import get_client_and_model
 
-
+EMBEDDINGS_MODEL = "text-embedding-3-small"
+
+
+def get_ragas_config():
+    client, model = get_client_and_model()
+    os.environ["OPENAI_API_BASE"] = str(client.base_url)
+
+    return {
+        "llm": ChatOpenAI(api_key=client.api_key, model=model),
+        "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
+    }
+
+
+def make_sub_col_udf(root_col, sub_col):
+    """Create a udf that extracts sub-column values from a dictionary."""
+
+    def _udf_get_sub_col(x):
+        if not isinstance(x, dict):
+            raise TypeError(
+                f"Expected a dictionary in column '{root_col}', got {type(x)}."
+            )
+
+        if sub_col not in x:
+            raise KeyError(
+                f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
+            )
+
+        return x[sub_col]
+
+    return _udf_get_sub_col
 
 
 def get_renamed_columns(df, column_map):
@@ -34,6 +59,7 @@ def get_renamed_columns(df, column_map):
     Returns:
         pd.DataFrame: The DataFrame with columns renamed.
     """
+
     new_df = df.copy()
 
     for new_name, source in column_map.items():
@@ -50,7 +76,7 @@ def get_renamed_columns(df, column_map):
 
         if root_col in new_df.columns:
             new_df[new_name] = new_df[root_col].apply(
-
+                make_sub_col_udf(root_col, sub_col)
             )
 
         else:
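The new `make_sub_col_udf` lets `get_renamed_columns` map a required ragas column onto a sub-key of a dictionary-valued DataFrame column. A self-contained sketch of the same extraction logic, re-implemented here rather than imported from the package; the example column names are invented:

```python
import pandas as pd


def make_sub_col_udf(root_col, sub_col):
    """Return a function that pulls `sub_col` out of dict values stored in `root_col`."""

    def _udf_get_sub_col(x):
        if not isinstance(x, dict):
            raise TypeError(f"Expected a dictionary in column '{root_col}', got {type(x)}.")
        if sub_col not in x:
            raise KeyError(f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'.")
        return x[sub_col]

    return _udf_get_sub_col


# Hypothetical dict-valued column, e.g. raw LLM output stored alongside metadata.
df = pd.DataFrame(
    {"llm_output": [{"answer": "42", "tokens": 7}, {"answer": "no", "tokens": 3}]}
)
df["answer"] = df["llm_output"].apply(make_sub_col_udf("llm_output", "answer"))
print(df["answer"].tolist())  # ['42', 'no']
```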
validmind/tests/model_validation/sklearn/ClusterPerformance.py

@@ -66,7 +66,7 @@ class ClusterPerformance(Metric):
         y_true_test = y_true_test.astype(y_pred_test.dtype).flatten()
         results = []
         for metric_name, metric_fcn in metric_info.items():
-            for
+            for _ in samples:
                 train_value = metric_fcn(list(y_true_train), y_pred_train)
                 test_value = metric_fcn(list(y_true_test), y_pred_test)
                 results.append(
@@ -85,7 +85,7 @@ class ClusterPerformance(Metric):
         """
         table_records = []
         for result in raw_results:
-            for key,
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,
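The loop variable is now discarded (`for _ in samples:`) because only the per-split metric values are used. A rough sketch of computing one such train/test metric pair with scikit-learn; the data and the choice of metric are illustrative, not the package's defaults:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split

X, y = make_blobs(n_samples=300, centers=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_train)

# Same shape as the loop body above: one metric evaluated on both splits.
train_value = adjusted_rand_score(y_train, model.predict(X_train))
test_value = adjusted_rand_score(y_test, model.predict(X_test))
print(train_value, test_value)
```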
validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py

@@ -52,7 +52,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
     """
 
     name = "models_performance_comparison"
-    required_inputs = ["
+    required_inputs = ["dataset", "models"]
     metadata = {
         "task_types": ["classification", "text_classification"],
         "tags": [
@@ -70,12 +70,12 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         """
         results = []
         prf_table = []
-        classes = {str(i) for i in unique(self.
+        classes = {str(i) for i in unique(self.inputs.dataset.y)}
 
         for class_name in classes:
             prf_dict = {}
             prf_dict["Class"] = class_name
-            for m,
+            for m, _ in metric_value.items():
                 prf_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 prf_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 prf_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -85,7 +85,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for class_name in avg_metrics:
             avg_dict = {}
             avg_dict["Class"] = class_name
-            for m,
+            for m, _ in metric_value.items():
                 avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -103,7 +103,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for metric_name in ["accuracy", "roc_auc"]:
             acc_roc_auc_dict = {}
             acc_roc_auc_dict["Metric"] = metric_name
-            for m,
+            for m, _ in metric_value.items():
                 acc_roc_auc_dict[f"accuracy- {m}"] = metric_value[m]["accuracy"]
                 acc_roc_auc_dict[f"roc_auc- {m}"] = metric_value[m]["roc_auc"]
             acc_roc_auc_table.append(acc_roc_auc_dict)
@@ -122,10 +122,8 @@ class ModelsPerformanceComparison(ClassifierPerformance):
                 "List of models must be provided as a `models` parameter to compare performance"
             )
 
-        all_models =
+        all_models = self.inputs.models
 
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
         results = {}
         for idx, model in enumerate(all_models):
             y_true = self.inputs.dataset.y
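The test now declares `required_inputs = ["dataset", "models"]` and iterates `self.inputs.models` directly instead of seeding and extending an `all_models` list. The per-class loops index a classification-report-style dict (`metric_value[m][class_name]["precision"]`); a hedged sketch of building that structure for two illustrative models:

```python
# Hedged sketch of the per-model report dict that the comparison table
# indexes as metric_value[m][class_name]["precision"]; models and data
# are illustrative, not the package's inputs.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, n_classes=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

models = {
    "model_0": LogisticRegression(max_iter=1000).fit(X_train, y_train),
    "model_1": DecisionTreeClassifier(random_state=0).fit(X_train, y_train),
}

metric_value = {
    name: classification_report(y_test, model.predict(X_test), output_dict=True)
    for name, model in models.items()
}

for m in metric_value:  # same access pattern as the updated loops above
    print(m, metric_value[m]["1"]["precision"], metric_value[m]["1"]["recall"])
```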
validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py

@@ -9,8 +9,11 @@ import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 from validmind.errors import SkipTestError
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
+logger = get_logger(__name__)
+
 
 @dataclass
 class RegressionModelsPerformanceComparison(Metric):
@@ -56,7 +59,7 @@ class RegressionModelsPerformanceComparison(Metric):
     """
 
     name = "models_performance_comparison"
-    required_inputs = ["
+    required_inputs = ["dataset", "models"]
 
     metadata = {
         "task_types": ["regression"],
@@ -76,8 +79,14 @@ class RegressionModelsPerformanceComparison(Metric):
         results["Mean Squared Error (MSE)"] = mse_test
         results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
 
-
-
+        if np.any(y_true_test == 0):
+            logger.warning(
+                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
+            )
+            results["Mean Absolute Percentage Error (MAPE)"] = None
+        else:
+            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
+            results["Mean Absolute Percentage Error (MAPE)"] = mape_test
 
         mbd_test = np.mean(y_pred_test - y_true_test)
         results["Mean Bias Deviation (MBD)"] = mbd_test
@@ -94,7 +103,7 @@ class RegressionModelsPerformanceComparison(Metric):
         for metric_name in metrics:
             errors_dict = {}
             errors_dict["Errors"] = metric_name
-            for m,
+            for m, _ in metric_value.items():
                 for metric in metrics:
                     res = re.findall(r"\(.*?\)", metric)
                     res[0][1:-1]
@@ -117,10 +126,7 @@ class RegressionModelsPerformanceComparison(Metric):
                 "List of models must be provided as a `models` parameter to compare performance"
             )
 
-        all_models =
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
+        all_models = self.inputs.models
 
         results = {}
 
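The new MAPE branch skips the metric, logging a warning, whenever any true value is zero instead of dividing by zero. The guard boils down to the following standalone helper; the function name is mine, not the package's:

```python
import numpy as np


def safe_mape(y_true, y_pred):
    """Return MAPE in percent, or None when a zero true value would blow it up."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if np.any(y_true == 0):
        return None  # mirrors the warning-and-skip branch in the diff
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


print(safe_mape([100, 200, 400], [110, 180, 420]))  # ~8.33
print(safe_mape([0, 200, 400], [10, 180, 420]))     # None
```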
validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py

@@ -53,7 +53,7 @@ class DurbinWatsonTest(Metric):
         """
         Calculates DB for each of the dataset features
         """
-        x_train = self.
+        x_train = self.inputs.dataset.df
         dw_values = {}
         for col in x_train.columns:
             dw_values[col] = durbin_watson(x_train[col].values)
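For reference, the per-column Durbin-Watson loop that now reads from `self.inputs.dataset.df` amounts to the following; the data is synthetic and only illustrates the statistic (values near 2 indicate no autocorrelation, values near 0 strong positive autocorrelation):

```python
import numpy as np
import pandas as pd
from statsmodels.stats.stattools import durbin_watson

rng = np.random.default_rng(0)
x_train = pd.DataFrame(
    {
        "noise": rng.normal(size=100),            # uncorrelated -> DW near 2
        "random_walk": rng.normal(size=100).cumsum(),  # autocorrelated -> DW near 0
    }
)

dw_values = {col: durbin_watson(x_train[col].values) for col in x_train.columns}
print(dw_values)
```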
validmind/tests/model_validation/statsmodels/GINITable.py

@@ -80,7 +80,7 @@ class GINITable(Metric):
         metrics_dict = {"Dataset": [], "AUC": [], "GINI": [], "KS": []}
 
         # Iterate over each dataset in the inputs
-        for
+        for _, dataset in enumerate(self.inputs.datasets):
             dataset_label = (
                 dataset.input_id
             )  # Use input_id as the label for each dataset
validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py

@@ -87,7 +87,7 @@ class KolmogorovSmirnov(Metric):
         if data_distribution not in ["norm" or "exp"]:
             InvalidTestParametersError("Dist parameter must be either 'norm' or 'exp'")
 
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         ks_values = {}
         for col in x_train.columns:
             ks_stat, p_value = kstest_normal(x_train[col].values, data_distribution)
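This change, and the matching ones in Lilliefors, RunsTest, and ShapiroWilk below, restrict the tests to the dataset's numeric feature columns so that non-numeric columns no longer break the per-column loop. A hedged equivalent using plain pandas dtype selection in place of `feature_columns_numeric`; data and column names are made up:

```python
import numpy as np
import pandas as pd
from statsmodels.stats.diagnostic import kstest_normal

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "amount": rng.normal(loc=10, scale=2, size=200),
        "age": rng.uniform(18, 80, size=200),
        "segment": rng.choice(["retail", "sme"], size=200),  # non-numeric, skipped
    }
)

# Run the KS normality test only over numeric columns.
numeric_cols = df.select_dtypes(include="number").columns
ks_values = {
    col: dict(zip(("stat", "pvalue"), kstest_normal(df[col].values, dist="norm")))
    for col in numeric_cols
}
print(ks_values)
```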
validmind/tests/model_validation/statsmodels/Lilliefors.py

@@ -70,7 +70,7 @@ class Lilliefors(Metric):
         """
         Calculates Lilliefors test for each of the dataset features
         """
-        x_train = self.
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
 
         lilliefors_values = {}
         for col in x_train.columns:
validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py

@@ -8,6 +8,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from scipy import stats
 
+from validmind.errors import SkipTestError
 from validmind.vm_models import Figure, Metric
 
 
@@ -115,6 +116,9 @@ class RegressionCoeffsPlot(Metric):
             all_models.extend(self.inputs.models)
 
         for i, model in enumerate(all_models):
+            if model.library != "statsmodels":
+                raise SkipTestError("Only statsmodels are supported for this metric")
+
             model_name = f"Model {i+1}"
 
             fig, metric_values = self.plot_coefficients_with_ci(model, model_name)
validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py

@@ -7,6 +7,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import seaborn as sns
 
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric
 
@@ -82,10 +83,14 @@ class RegressionFeatureSignificance(Metric):
         # Initialize a list to store figures
         figures = []
 
-        for i,
+        for i, model in enumerate(model_list):
+
+            if model.library != "statsmodels":
+                raise SkipTestError("Only statsmodels are supported for this metric")
+
             # Get the coefficients and p-values from the model
-            coefficients =
-            pvalues =
+            coefficients = model.model.params
+            pvalues = model.model.pvalues
 
             # Sort the variables by p-value in ascending order
             sorted_idx = pvalues.argsort()
@@ -122,7 +127,7 @@ class RegressionFeatureSignificance(Metric):
                     for_object=self,
                     key=f"{self.key}:{i}",
                     figure=fig,
-                    metadata={"model": str(
+                    metadata={"model": str(model.model)},
                 )
             )
         plt.close("all")
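With the statsmodels guard in place, the metric reads estimates and p-values straight off the fitted results object (`model.model.params` / `model.model.pvalues`, assuming `model.model` wraps a fitted statsmodels result as the diff suggests). A standalone sketch of that extraction and the ascending p-value sort, on synthetic data:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(200, 3)))
y = X @ np.array([1.0, 2.0, 0.0, -0.5]) + rng.normal(size=200)

fit = sm.OLS(y, X).fit()
coefficients = fit.params       # what model.model.params would expose
pvalues = fit.pvalues           # what model.model.pvalues would expose
sorted_idx = pvalues.argsort()  # same ascending p-value ordering as the diff
print(coefficients[sorted_idx])
print(pvalues[sorted_idx])
```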
validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py

@@ -73,9 +73,9 @@ class RegressionModelsCoeffs(Metric):
             raise ValueError("List of models must be provided in the models parameter")
 
         for model in self.inputs.models:
-            if model.
+            if model.library != "statsmodels":
                 raise SkipTestError(
-                    "Only statsmodels
+                    "Only statsmodels models are supported for this metric"
                 )
 
         coefficients = [m.regression_coefficients() for m in self.inputs.models]
validmind/tests/model_validation/statsmodels/RunsTest.py

@@ -59,7 +59,7 @@ class RunsTest(Metric):
         """
         Calculates the run test for each of the dataset features
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
 
         runs_test_values = {}
         for col in x_train.columns:
validmind/tests/model_validation/statsmodels/ShapiroWilk.py

@@ -53,7 +53,7 @@ class ShapiroWilk(Metric):
         """
         Calculates Shapiro-Wilk test for each of the dataset features.
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         sw_values = {}
         for col in x_train.columns:
             sw_stat, sw_pvalue = stats.shapiro(x_train[col].values)