validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -63
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
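
Several stationarity tests (ADF, DFGLSArch, KPSS, PhillipsPerronArch, ZivotAndrewsArch) move from the model_validation/statsmodels namespace to data_validation in this release, so their test IDs change with them. A minimal sketch of the caller-side update, assuming the usual validmind test-ID scheme and a registered vm_dataset input:

import validmind as vm

# 2.2.6 test ID (no longer resolves after the move):
# vm.tests.run_test("validmind.model_validation.statsmodels.ADF", inputs={"dataset": vm_dataset})

# 2.3.1 test ID for the same test:
vm.tests.run_test("validmind.data_validation.ADF", inputs={"dataset": vm_dataset})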
validmind/tests/data_validation/TabularDateTimeHistograms.py CHANGED
@@ -2,8 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import pandas as pd
+import plotly.graph_objects as go
 
 from validmind.vm_models import Figure, Metric
 
@@ -50,45 +50,41 @@ class TabularDateTimeHistograms(Metric):
 
     metadata = {
         "task_types": ["classification", "regression"],
-        "tags": ["
+        "tags": ["time_series_data", "visualization"],
     }
 
     def run(self):
         df = self.inputs.dataset.df
 
-        # ...
-        datetime_columns = ...
-
-        if len(datetime_columns) == 0:
-            raise ValueError("No datetime columns found in the dataset")
+        # Check if the index is a datetime type
+        if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError("Index must be a datetime type")
 
         figures = []
-        for col in datetime_columns:
-            plt.figure()
-            fig, _ = plt.subplots()
-
-            # Calculate the difference between consecutive dates and convert to days
-            date_diffs = df[col].sort_values().diff().dt.days.dropna()
-
-            # Filter out 0 values
-            date_diffs = date_diffs[date_diffs != 0]
-
-            ax = sns.histplot(date_diffs, kde=False, bins=30)
-            plt.title(f"{col}", weight="bold", fontsize=20)
-
-            plt.xticks(fontsize=18)
-            plt.yticks(fontsize=18)
-            ax.set_xlabel("Days Between Consecutive Dates", fontsize=18)
-            ax.set_ylabel("Frequency", fontsize=18)
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.key}:{col}",
-                    figure=fig,
-                )
-            )
 
-
+        # Calculate the difference between consecutive dates in the index
+        date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
+
+        # Filter out 0 values
+        date_diffs = date_diffs[date_diffs != 0]
+
+        # Create a histogram using Plotly
+        fig = go.Figure()
+        fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
+        fig.update_layout(
+            title="Index",
+            xaxis_title="Days Between Consecutive Dates",
+            yaxis_title="Frequency",
+            font=dict(size=18),
+        )
+
+        figures.append(
+            Figure(
+                for_object=self,
+                key=f"{self.key}:index",
+                figure=fig,
+            )
+        )
 
         return self.cache_results(
             figures=figures,
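
The rewrite above drops the per-column matplotlib/seaborn histograms in favor of a single Plotly histogram of day gaps computed from the dataset's DatetimeIndex. A standalone sketch of the same computation on toy data (pandas and plotly only; not part of the package):

import pandas as pd
import plotly.graph_objects as go

# Toy frame with an irregular datetime index
df = pd.DataFrame(
    {"value": range(5)},
    index=pd.to_datetime(
        ["2024-01-01", "2024-01-02", "2024-01-05", "2024-01-05", "2024-01-09"]
    ),
)

# Gaps in days between consecutive index entries, as in the new run()
date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
date_diffs = date_diffs[date_diffs != 0]  # drop same-day duplicates

fig = go.Figure(go.Histogram(x=date_diffs, nbinsx=30))
fig.update_layout(xaxis_title="Days Between Consecutive Dates", yaxis_title="Frequency")
fig.show()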
validmind/tests/data_validation/WOEBinPlots.py CHANGED
@@ -58,7 +58,7 @@ class WOEBinPlots(Metric):
     """
 
     name = "woe_bin_plots"
-    required_inputs = ...
+    required_inputs = ["dataset"]
     default_params = {"breaks_adj": None, "fig_height": 600, "fig_width": 500}
     metadata = {
         "task_types": ["classification"],
validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py CHANGED
@@ -4,9 +4,14 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from arch.unitroot import ZivotAndrews
+from numpy.linalg import LinAlgError
 
-from validmind.vm_models import ...
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -57,14 +62,63 @@ class ZivotAndrewsArch(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        ...
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store Zivot-Andrews results
+        za_values = []
+
         for col in dataset.columns:
-            ...
+            try:
+                za = ZivotAndrews(dataset[col].values)
+                za_values.append(
+                    {
+                        "Variable": col,
+                        "stat": za.stat,
+                        "pvalue": za.pvalue,
+                        "usedlag": za.lags,
+                        "nobs": za.nobs,
+                    }
+                )
+            except (LinAlgError, ValueError) as e:
+                logger.error(f"Error while processing column '{col}'. Details: {e}")
+                za_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "nobs": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"zivot_andrews_results": za_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the Zivot-Andrews results
+        """
+        za_results = metric_value["zivot_andrews_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=za_results,
+                    metadata=ResultTableMetadata(title="Zivot-Andrews Test Results"),
+                )
+            ]
+        )
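
The new per-column try/except reflects how arch's ZivotAndrews can fail on degenerate inputs (constant or collinear series raise LinAlgError or ValueError). A minimal sketch of the underlying call on a toy series:

import numpy as np
from arch.unitroot import ZivotAndrews
from numpy.linalg import LinAlgError

series = np.cumsum(np.random.default_rng(0).normal(size=200))  # toy random walk

try:
    za = ZivotAndrews(series)
    print(f"stat={za.stat:.3f} pvalue={za.pvalue:.3f} lags={za.lags} nobs={za.nobs}")
except (LinAlgError, ValueError) as e:
    print(f"Zivot-Andrews failed: {e}")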
validmind/tests/data_validation/nlp/CommonWords.py CHANGED
@@ -52,7 +52,7 @@ class CommonWords(Metric):
     """
 
     name = "common_words"
-    required_inputs = ["dataset"
+    required_inputs = ["dataset"]
     metadata = {
         "task_types": ["text_classification", "text_summarization"],
         "tags": ["nlp", "text_data", "visualization", "frequency_analysis"],
validmind/tests/data_validation/nlp/Hashtags.py CHANGED
@@ -54,7 +54,7 @@ class Hashtags(ThresholdTest):
     """
 
     name = "hashtags"
-    required_inputs = ["dataset"
+    required_inputs = ["dataset"]
     default_params = {"top_hashtags": 25}
     metadata = {
         "task_types": ["text_classification", "text_summarization"],
validmind/tests/data_validation/nlp/Mentions.py CHANGED
@@ -54,7 +54,7 @@ class Mentions(ThresholdTest):
 
     name = "mentions"
 
-    required_inputs = ["dataset"
+    required_inputs = ["dataset"]
     default_params = {"top_mentions": 25}
     metadata = {
         "task_types": ["text_classification", "text_summarization"],
validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py CHANGED
@@ -10,7 +10,7 @@ from textblob import TextBlob
 from validmind import tags, tasks
 
 
-@tags("data_validation")
+@tags("nlp", "text_data", "data_validation")
 @tasks("nlp")
 def PolarityAndSubjectivity(dataset):
     """
@@ -27,6 +27,7 @@ def PolarityAndSubjectivity(dataset):
     Returns:
         plotly.graph_objs._figure.Figure: A Plotly scatter plot of polarity vs subjectivity.
     """
+
     # Function to calculate sentiment and subjectivity
     def analyze_sentiment(text):
         analysis = TextBlob(text)
validmind/tests/data_validation/nlp/Punctuations.py CHANGED
@@ -51,7 +51,7 @@ class Punctuations(Metric):
     """
 
     name = "punctuations"
-    required_inputs = ["dataset"
+    required_inputs = ["dataset"]
     metadata = {
         "task_types": ["text_classification", "text_summarization"],
         "tags": ["nlp", "text_data", "visualization", "frequency_analysis"],
validmind/tests/data_validation/nlp/TextDescription.py CHANGED
@@ -60,7 +60,7 @@ class TextDescription(Metric):
     """
 
     name = "text_description"
-    required_inputs = ["dataset"
+    required_inputs = ["dataset"]
     default_params = {
         "unwanted_tokens": {
             "s",
@@ -79,6 +79,10 @@ class TextDescription(Metric):
         "num_top_words": 3,
         "lang": "english",
     }
+    metadata = {
+        "task_types": ["text_classification", "text_summarization"],
+        "tags": ["nlp", "text_data", "visualization"],
+    }
 
     def general_text_metrics(self, df, text_column):
         nltk.download("punkt", quiet=True)
validmind/tests/decorator.py CHANGED
@@ -13,9 +13,9 @@ from uuid import uuid4
 
 import pandas as pd
 
+from validmind.ai.test_descriptions import get_description_metadata
 from validmind.errors import MissingRequiredTestInputError
 from validmind.logging import get_logger
-from validmind.utils import get_description_metadata
 from validmind.vm_models import (
     Metric,
     MetricResult,
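
get_description_metadata now lives in the new validmind.ai package rather than validmind.utils (see the +104-line validmind/ai/utils.py and the ai.py → ai/test_descriptions.py rename in the file list). Code that imported it directly needs the same one-line change:

# 2.2.6
# from validmind.utils import get_description_metadata
# 2.3.1
from validmind.ai.test_descriptions import get_description_metadata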
validmind/tests/model_validation/FeaturesAUC.py CHANGED
@@ -55,10 +55,12 @@ class FeaturesAUC(Metric):
     }
 
     def run(self):
-        ...
-        ...
+        dataset = self.inputs.dataset
+        x = dataset.x_df()
+        y = dataset.y_df()
+        n_targets = dataset.df[dataset.target_column].nunique()
 
-        if ...
+        if n_targets != 2:
             raise SkipTestError("FeaturesAUC metric requires a binary target variable.")
 
         aucs = pd.DataFrame(index=x.columns, columns=["AUC"])
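
The restored run() body fetches the feature and target frames from the VM dataset and skips unless the target is binary; the aucs table it then fills presumably holds each feature's standalone ROC AUC. A sketch of that idea on toy data (sklearn and pandas only, not the package's code):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, n_features=4, random_state=0)
x = pd.DataFrame(X, columns=[f"f{i}" for i in range(4)])

# AUC of each feature on its own, using the raw column as a ranking score
aucs = pd.DataFrame(
    {"AUC": [roc_auc_score(y, x[col]) for col in x.columns]}, index=x.columns
)
print(aucs)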
validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py CHANGED
@@ -9,7 +9,11 @@ import pandas as pd
 import plotly.express as px
 from sklearn.metrics.pairwise import cosine_similarity
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def CosineSimilarityComparison(dataset, models):
     """
     Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
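
This and the five embeddings tests below gain explicit @tags/@tasks decorators, which attach the same metadata that class-based tests carry in their metadata dict. A sketch of the pattern for a custom functional test (the test name and body here are illustrative):

from validmind import tags, tasks

@tags("visualization", "embeddings")
@tasks("text_qa")
def MyEmbeddingCheck(dataset, model):
    """One-line description shown in the test catalog."""
    # ... compute and return tables and/or figures as usual ...
    return {"rows": len(dataset.df)}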
validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py CHANGED
@@ -6,7 +6,11 @@ import numpy as np
 import plotly.express as px
 from sklearn.metrics.pairwise import cosine_similarity
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def CosineSimilarityHeatmap(
     dataset,
     model,
validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py CHANGED
@@ -9,7 +9,11 @@ import pandas as pd
 import plotly.express as px
 from sklearn.metrics.pairwise import euclidean_distances
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def EuclideanDistanceComparison(dataset, models):
     """
     Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py CHANGED
@@ -6,7 +6,11 @@ import numpy as np
 import plotly.express as px
 from sklearn.metrics.pairwise import euclidean_distances
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def EuclideanDistanceHeatmap(
     dataset,
     model,
validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py CHANGED
@@ -10,7 +10,11 @@ import plotly.express as px
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def PCAComponentsPairwisePlots(dataset, model, n_components=3):
     """
     Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py CHANGED
@@ -10,7 +10,11 @@ import plotly.express as px
 from sklearn.manifold import TSNE
 from sklearn.preprocessing import StandardScaler
 
+from validmind import tags, tasks
 
+
+@tags("visualization", "dimensionality_reduction", "embeddings")
+@tasks("text_qa", "text_generation", "text_summarization")
 def TSNEComponentsPairwisePlots(
     dataset,
     model,
validmind/tests/model_validation/ragas/AnswerCorrectness.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import answer_correctness
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm")
@@ -104,7 +104,7 @@ def AnswerCorrectness(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[answer_correctness]
+        Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
@@ -112,7 +112,7 @@ def AnswerCorrectness(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 ["question", "answer", "ground_truth", "answer_correctness"]
             ],
             "Aggregate Scores": [
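
Every ragas test in this release now splats **get_ragas_config() into ragas' evaluate(), centralizing judge-LLM and embeddings configuration in ragas/utils.py (+35 -9) instead of relying on ragas defaults. The helper's body is not shown in this diff; given evaluate()'s keyword arguments in ragas 0.1.x, it plausibly returns something shaped like this (hypothetical sketch, not the actual implementation):

# Hypothetical: the real helper lives in validmind/tests/model_validation/ragas/utils.py
def get_ragas_config():
    return {
        "llm": ...,         # a configured judge LLM wrapper
        "embeddings": ...,  # a configured embeddings wrapper
    }

result_df = evaluate(
    Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
).to_pandas()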
validmind/tests/model_validation/ragas/AnswerRelevance.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import answer_relevancy
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "rag_performance")
@@ -108,8 +108,7 @@ def AnswerRelevance(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[answer_relevancy],
+        Dataset.from_pandas(df), metrics=[answer_relevancy], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
@@ -117,7 +116,9 @@ def AnswerRelevance(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["question", "contexts", "answer", "answer_relevancy"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_relevancy"].mean(),
validmind/tests/model_validation/ragas/AnswerSimilarity.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import answer_similarity
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm")
@@ -93,8 +93,7 @@ def AnswerSimilarity(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[answer_similarity],
+        Dataset.from_pandas(df), metrics=[answer_similarity], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def AnswerSimilarity(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["answer", "ground_truth", "answer_similarity"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_similarity"].mean(),
validmind/tests/model_validation/ragas/AspectCritique.py CHANGED
@@ -18,7 +18,7 @@ from ragas.metrics.critique import (
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 aspect_map = {
     "coherence": coherence,
@@ -36,14 +36,14 @@ def AspectCritique(
     question_column="question",
     answer_column="answer",
     contexts_column="contexts",
-    aspects: list = [
+    aspects: list = [ # noqa: B006 this is fine as immutable default since it never gets modified
         "coherence",
         "conciseness",
         "correctness",
         "harmfulness",
         "maliciousness",
     ],
-    additional_aspects: list =
+    additional_aspects: list = None,
 ):
     """
     Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -131,13 +131,19 @@ def AspectCritique(
     df = get_renamed_columns(dataset.df, required_columns)
 
     built_in_aspects = [aspect_map[aspect] for aspect in aspects]
-    custom_aspects = ...
+    custom_aspects = (
+        [
+            _AspectCritique(name=name, definition=description)
+            for name, description in additional_aspects
+        ]
+        if additional_aspects
+        else []
+    )
     all_aspects = [*built_in_aspects, *custom_aspects]
 
-    result_df = evaluate(
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
+    ).to_pandas()
 
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
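
With additional_aspects defaulting to None, custom aspects are built only when the caller passes (name, definition) pairs. A hypothetical usage sketch, assuming the standard run_test entry point and a registered vm_dataset:

import validmind as vm

vm.tests.run_test(
    "validmind.model_validation.ragas.AspectCritique",
    inputs={"dataset": vm_dataset},
    params={
        "aspects": ["coherence", "conciseness"],
        "additional_aspects": [
            ("politeness", "Is the answer polite and professional?"),
        ],
    },
)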
validmind/tests/model_validation/ragas/ContextEntityRecall.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import context_entity_recall
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -99,8 +99,7 @@ def ContextEntityRecall(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_entity_recall],
+        Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(
@@ -110,7 +109,7 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 [
                     "contexts",
                     "ground_truth",
validmind/tests/model_validation/ragas/ContextPrecision.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import context_precision
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -21,7 +21,7 @@ def ContextPrecision(
     question_column: str = "question",
     contexts_column: str = "contexts",
     ground_truth_column: str = "ground_truth",
-):
+): # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
     relevant items present in the contexts are ranked higher or not. Ideally all the
@@ -95,8 +95,7 @@ def ContextPrecision(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_precision],
+        Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_precision"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextPrecision(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 ["question", "contexts", "ground_truth", "context_precision"]
             ],
             "Aggregate Scores": [
validmind/tests/model_validation/ragas/ContextRecall.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import context_recall
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -95,8 +95,7 @@ def ContextRecall(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_recall],
+        Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_recall"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextRecall(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
                 ["question", "contexts", "ground_truth", "context_recall"]
             ],
             "Aggregate Scores": [
validmind/tests/model_validation/ragas/ContextRelevancy.py CHANGED
@@ -11,7 +11,7 @@ from ragas.metrics import context_relevancy
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "retrieval_performance")
@@ -88,8 +88,7 @@ def ContextRelevancy(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[context_relevancy],
+        Dataset.from_pandas(df), metrics=[context_relevancy], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["context_relevancy"].to_list(), nbins=10)
@@ -97,7 +96,9 @@ def ContextRelevancy(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["question", "contexts", "context_relevancy"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_relevancy"].mean(),