validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +74 -82
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +58 -19
  5. validmind/client.py +5 -5
  6. validmind/models/foundation.py +10 -6
  7. validmind/models/function.py +3 -1
  8. validmind/models/metadata.py +1 -1
  9. validmind/test_suites/__init__.py +1 -7
  10. validmind/test_suites/regression.py +0 -16
  11. validmind/test_suites/statsmodels_timeseries.py +1 -1
  12. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  13. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  14. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  15. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  16. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  17. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  18. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  19. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  20. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  21. validmind/tests/data_validation/ScatterPlot.py +1 -1
  22. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  23. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  24. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  25. validmind/tests/data_validation/WOEBinTable.py +1 -1
  26. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  27. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  28. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  29. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  30. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  31. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  32. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  33. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  34. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  35. validmind/tests/decorator.py +1 -1
  36. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  37. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  38. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  39. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  41. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  42. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  44. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  45. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  46. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  47. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  48. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  49. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  50. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  51. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  52. validmind/tests/model_validation/ragas/utils.py +35 -9
  53. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  54. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  55. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  56. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  57. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  58. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  59. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  60. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  61. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  62. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  63. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  64. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  65. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  66. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  67. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  68. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  69. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  70. validmind/tests/prompt_validation/Bias.py +14 -11
  71. validmind/tests/prompt_validation/Clarity.py +14 -11
  72. validmind/tests/prompt_validation/Conciseness.py +14 -11
  73. validmind/tests/prompt_validation/Delimitation.py +14 -11
  74. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  75. validmind/tests/prompt_validation/Robustness.py +11 -11
  76. validmind/tests/prompt_validation/Specificity.py +14 -11
  77. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  78. validmind/unit_metrics/composite.py +2 -1
  79. validmind/utils.py +4 -63
  80. validmind/vm_models/dataset/dataset.py +17 -3
  81. validmind/vm_models/dataset/utils.py +2 -2
  82. validmind/vm_models/model.py +1 -1
  83. validmind/vm_models/test/metric.py +1 -8
  84. validmind/vm_models/test/result_wrapper.py +2 -2
  85. validmind/vm_models/test/test.py +3 -0
  86. validmind/vm_models/test/threshold_test.py +1 -1
  87. validmind/vm_models/test_suite/runner.py +7 -4
  88. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  89. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
  90. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  91. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  92. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  93. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  94. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  95. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  96. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  97. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  98. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  99. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  100. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  101. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  102. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
@@ -2,8 +2,8 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import matplotlib.pyplot as plt
- import seaborn as sns
+ import pandas as pd
+ import plotly.graph_objects as go

  from validmind.vm_models import Figure, Metric

@@ -50,45 +50,41 @@ class TabularDateTimeHistograms(Metric):

  metadata = {
  "task_types": ["classification", "regression"],
- "tags": ["tabular_data", "visualization"],
+ "tags": ["time_series_data", "visualization"],
  }

  def run(self):
  df = self.inputs.dataset.df

- # Extract datetime columns from the dataset
- datetime_columns = df.select_dtypes(include=["datetime64"]).columns.tolist()
-
- if len(datetime_columns) == 0:
- raise ValueError("No datetime columns found in the dataset")
+ # Check if the index is a datetime type
+ if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+ raise ValueError("Index must be a datetime type")

  figures = []
- for col in datetime_columns:
- plt.figure()
- fig, _ = plt.subplots()
-
- # Calculate the difference between consecutive dates and convert to days
- date_diffs = df[col].sort_values().diff().dt.days.dropna()
-
- # Filter out 0 values
- date_diffs = date_diffs[date_diffs != 0]
-
- ax = sns.histplot(date_diffs, kde=False, bins=30)
- plt.title(f"{col}", weight="bold", fontsize=20)
-
- plt.xticks(fontsize=18)
- plt.yticks(fontsize=18)
- ax.set_xlabel("Days Between Consecutive Dates", fontsize=18)
- ax.set_ylabel("Frequency", fontsize=18)
- figures.append(
- Figure(
- for_object=self,
- key=f"{self.key}:{col}",
- figure=fig,
- )
- )

- plt.close("all")
+ # Calculate the difference between consecutive dates in the index
+ date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
+
+ # Filter out 0 values
+ date_diffs = date_diffs[date_diffs != 0]
+
+ # Create a histogram using Plotly
+ fig = go.Figure()
+ fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
+ fig.update_layout(
+ title="Index",
+ xaxis_title="Days Between Consecutive Dates",
+ yaxis_title="Frequency",
+ font=dict(size=18),
+ )
+
+ figures.append(
+ Figure(
+ for_object=self,
+ key=f"{self.key}:index",
+ figure=fig,
+ )
+ )

  return self.cache_results(
  figures=figures,
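The rewrite above changes the test's contract: instead of scanning for datetime64 columns, TabularDateTimeHistograms now requires the timestamps to live on the dataset's index (a DatetimeIndex or PeriodIndex). A minimal pandas sketch of data that satisfies the new guard; the column name and frequency are made-up examples, not from the package:

    import pandas as pd

    # Hypothetical dataset: the timestamps sit on the index, not in a column.
    df = pd.DataFrame(
        {"value": range(10)},
        index=pd.date_range("2024-01-01", periods=10, freq="D"),
    )

    # Mirrors the new check in run(): a plain RangeIndex would raise ValueError.
    assert isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex))

    # The quantity that gets plotted: day gaps between consecutive index entries.
    date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
    print(date_diffs[date_diffs != 0])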
@@ -58,7 +58,7 @@ class WOEBinPlots(Metric):
  """

  name = "woe_bin_plots"
- required_context = ["dataset"]
+ required_inputs = ["dataset"]
  default_params = {"breaks_adj": None, "fig_height": 600, "fig_width": 500}
  metadata = {
  "task_types": ["classification"],
@@ -46,7 +46,7 @@ class WOEBinTable(Metric):
  """

  name = "woe_bin_table"
- required_context = ["dataset"]
+ required_inputs = ["dataset"]
  default_params = {"breaks_adj": None}
  metadata = {
  "task_types": ["classification"],
@@ -4,9 +4,14 @@

  from dataclasses import dataclass

+ import pandas as pd
  from arch.unitroot import ZivotAndrews
+ from numpy.linalg import LinAlgError

- from validmind.vm_models import Metric
+ from validmind.logging import get_logger
+ from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+ logger = get_logger(__name__)


  @dataclass
@@ -57,14 +62,63 @@ class ZivotAndrewsArch(Metric):
  """
  dataset = self.inputs.dataset.df

- za_values = {}
+ # Check if the dataset is a time series
+ if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+ raise ValueError(
+ "Dataset index must be a datetime or period index for time series analysis."
+ )
+
+ # Preprocessing: Drop rows with any NaN values
+ if dataset.isnull().values.any():
+ logger.warning(
+ "Dataset contains missing values. Rows with NaNs will be dropped."
+ )
+ dataset = dataset.dropna()
+
+ # Convert to numeric and handle non-numeric data
+ dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+ # Initialize a list to store Zivot-Andrews results
+ za_values = []
+
  for col in dataset.columns:
- za = ZivotAndrews(dataset[col].values)
- za_values[col] = {
- "stat": za.stat,
- "pvalue": za.pvalue,
- "usedlag": za.lags,
- "nobs": za.nobs,
- }
-
- return self.cache_results(za_values)
+ try:
+ za = ZivotAndrews(dataset[col].values)
+ za_values.append(
+ {
+ "Variable": col,
+ "stat": za.stat,
+ "pvalue": za.pvalue,
+ "usedlag": za.lags,
+ "nobs": za.nobs,
+ }
+ )
+ except (LinAlgError, ValueError) as e:
+ logger.error(f"Error while processing column '{col}'. Details: {e}")
+ za_values.append(
+ {
+ "Variable": col,
+ "stat": None,
+ "pvalue": None,
+ "usedlag": None,
+ "nobs": None,
+ "error": str(e),
+ }
+ )
+
+ return self.cache_results({"zivot_andrews_results": za_values})
+
+ def summary(self, metric_value):
+ """
+ Build a table for summarizing the Zivot-Andrews results
+ """
+ za_results = metric_value["zivot_andrews_results"]
+
+ return ResultSummary(
+ results=[
+ ResultTable(
+ data=za_results,
+ metadata=ResultTableMetadata(title="Zivot-Andrews Test Results"),
+ )
+ ]
+ )
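The per-column error handling introduced above can be exercised on its own. This is a small standalone sketch, using synthetic random-walk data rather than anything from the package, of the same try/except pattern around arch's ZivotAndrews (the attribute names come straight from the hunk):

    import numpy as np
    import pandas as pd
    from arch.unitroot import ZivotAndrews
    from numpy.linalg import LinAlgError

    # Synthetic random walk standing in for one numeric column of the dataset.
    rng = np.random.default_rng(0)
    series = pd.Series(rng.normal(size=200).cumsum(), name="col")

    try:
        za = ZivotAndrews(series.values)
        row = {"Variable": series.name, "stat": za.stat, "pvalue": za.pvalue,
               "usedlag": za.lags, "nobs": za.nobs}
    except (LinAlgError, ValueError) as exc:
        # A degenerate column no longer aborts the whole metric; it yields an error row.
        row = {"Variable": series.name, "stat": None, "pvalue": None,
               "usedlag": None, "nobs": None, "error": str(exc)}

    print(row)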
@@ -52,7 +52,7 @@ class CommonWords(Metric):
  """

  name = "common_words"
- required_inputs = ["dataset", "dataset.text_column"]
+ required_inputs = ["dataset"]
  metadata = {
  "task_types": ["text_classification", "text_summarization"],
  "tags": ["nlp", "text_data", "visualization", "frequency_analysis"],
@@ -54,7 +54,7 @@ class Hashtags(ThresholdTest):
  """

  name = "hashtags"
- required_inputs = ["dataset", "dataset.text_column"]
+ required_inputs = ["dataset"]
  default_params = {"top_hashtags": 25}
  metadata = {
  "task_types": ["text_classification", "text_summarization"],
@@ -54,7 +54,7 @@ class Mentions(ThresholdTest):

  name = "mentions"

- required_inputs = ["dataset", "dataset.text_column"]
+ required_inputs = ["dataset"]
  default_params = {"top_mentions": 25}
  metadata = {
  "task_types": ["text_classification", "text_summarization"],
@@ -10,7 +10,7 @@ from textblob import TextBlob
  from validmind import tags, tasks


- @tags("data_validation")
+ @tags("nlp", "text_data", "data_validation")
  @tasks("nlp")
  def PolarityAndSubjectivity(dataset):
  """
@@ -27,6 +27,7 @@ def PolarityAndSubjectivity(dataset):
  Returns:
  plotly.graph_objs._figure.Figure: A Plotly scatter plot of polarity vs subjectivity.
  """
+
  # Function to calculate sentiment and subjectivity
  def analyze_sentiment(text):
  analysis = TextBlob(text)
@@ -51,7 +51,7 @@ class Punctuations(Metric):
  """

  name = "punctuations"
- required_inputs = ["dataset", "dataset.text_column"]
+ required_inputs = ["dataset"]
  metadata = {
  "task_types": ["text_classification", "text_summarization"],
  "tags": ["nlp", "text_data", "visualization", "frequency_analysis"],
@@ -11,7 +11,7 @@ from nltk.sentiment import SentimentIntensityAnalyzer
  from validmind import tags, tasks


- @tags("data_validation")
+ @tags("nlp", "text_data", "data_validation")
  @tasks("nlp")
  def Sentiment(dataset):
  """
@@ -60,7 +60,7 @@ class TextDescription(Metric):
  """

  name = "text_description"
- required_inputs = ["dataset", "dataset.text_column"]
+ required_inputs = ["dataset"]
  default_params = {
  "unwanted_tokens": {
  "s",
@@ -79,6 +79,10 @@
  "num_top_words": 3,
  "lang": "english",
  }
+ metadata = {
+ "task_types": ["text_classification", "text_summarization"],
+ "tags": ["nlp", "text_data", "visualization"],
+ }

  def general_text_metrics(self, df, text_column):
  nltk.download("punkt", quiet=True)
@@ -9,7 +9,7 @@ import seaborn as sns
  from validmind import tags, tasks


- @tags("data_validation")
+ @tags("nlp", "text_data", "data_validation")
  @tasks("nlp")
  def Toxicity(dataset):
  """
@@ -13,9 +13,9 @@ from uuid import uuid4

  import pandas as pd

+ from validmind.ai.test_descriptions import get_description_metadata
  from validmind.errors import MissingRequiredTestInputError
  from validmind.logging import get_logger
- from validmind.utils import get_description_metadata
  from validmind.vm_models import (
  Metric,
  MetricResult,
@@ -55,10 +55,12 @@ class FeaturesAUC(Metric):
  }

  def run(self):
- x = self.inputs.dataset.x_df()
- y = self.inputs.dataset.y_df()
+ dataset = self.inputs.dataset
+ x = dataset.x_df()
+ y = dataset.y_df()
+ n_targets = dataset.df[dataset.target_column].nunique()

- if y.nunique() != 2:
+ if n_targets != 2:
  raise SkipTestError("FeaturesAUC metric requires a binary target variable.")

  aucs = pd.DataFrame(index=x.columns, columns=["AUC"])
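The FeaturesAUC change above swaps `y.nunique()` for a count taken directly from the target column. A likely reason is a pandas subtlety: if a `y_df()`-style accessor returns a DataFrame (as its name suggests), `DataFrame.nunique()` yields a per-column Series rather than a scalar, so comparing it to 2 is not a plain boolean check. A quick, self-contained illustration (the column name is hypothetical):

    import pandas as pd

    target = pd.Series([0, 1, 1, 0], name="target")
    y_frame = target.to_frame()          # what a y_df()-style accessor typically returns

    print(target.nunique())              # 2  -> a plain int, safe in an if-statement
    print(y_frame.nunique())             # target    2  -> a Series, so `!= 2` is ambiguous
    print(y_frame["target"].nunique())   # 2  -> selecting the column restores a scalar check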
@@ -9,7 +9,11 @@ import pandas as pd
  import plotly.express as px
  from sklearn.metrics.pairwise import cosine_similarity

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def CosineSimilarityComparison(dataset, models):
  """
  Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
@@ -6,7 +6,11 @@ import numpy as np
  import plotly.express as px
  from sklearn.metrics.pairwise import cosine_similarity

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def CosineSimilarityHeatmap(
  dataset,
  model,
@@ -9,7 +9,11 @@ import pandas as pd
  import plotly.express as px
  from sklearn.metrics.pairwise import euclidean_distances

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def EuclideanDistanceComparison(dataset, models):
  """
  Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
@@ -6,7 +6,11 @@ import numpy as np
  import plotly.express as px
  from sklearn.metrics.pairwise import euclidean_distances

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def EuclideanDistanceHeatmap(
  dataset,
  model,
@@ -10,7 +10,11 @@ import plotly.express as px
  from sklearn.decomposition import PCA
  from sklearn.preprocessing import StandardScaler

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def PCAComponentsPairwisePlots(dataset, model, n_components=3):
  """
  Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
@@ -10,7 +10,11 @@ import plotly.express as px
  from sklearn.manifold import TSNE
  from sklearn.preprocessing import StandardScaler

+ from validmind import tags, tasks

+
+ @tags("visualization", "dimensionality_reduction", "embeddings")
+ @tasks("text_qa", "text_generation", "text_summarization")
  def TSNEComponentsPairwisePlots(
  dataset,
  model,
@@ -11,7 +11,7 @@ from ragas.metrics import answer_correctness

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm")
@@ -104,7 +104,7 @@ def AnswerCorrectness(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df), metrics=[answer_correctness]
+ Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
@@ -112,7 +112,7 @@ def AnswerCorrectness(

  return (
  {
- "Scores": result_df[
+ "Scores (will not be uploaded to UI)": result_df[
  ["question", "answer", "ground_truth", "answer_correctness"]
  ],
  "Aggregate Scores": [
@@ -11,7 +11,7 @@ from ragas.metrics import answer_relevancy

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm", "rag_performance")
@@ -108,8 +108,7 @@ def AnswerRelevance(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[answer_relevancy],
+ Dataset.from_pandas(df), metrics=[answer_relevancy], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
@@ -117,7 +116,9 @@ def AnswerRelevance(

  return (
  {
- "Scores": result_df[["question", "contexts", "answer", "answer_relevancy"]],
+ "Scores (will not be uploaded to UI)": result_df[
+ ["question", "contexts", "answer", "answer_relevancy"]
+ ],
  "Aggregate Scores": [
  {
  "Mean Score": result_df["answer_relevancy"].mean(),
@@ -11,7 +11,7 @@ from ragas.metrics import answer_similarity

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm")
@@ -93,8 +93,7 @@ def AnswerSimilarity(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[answer_similarity],
+ Dataset.from_pandas(df), metrics=[answer_similarity], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def AnswerSimilarity(

  return (
  {
- "Scores": result_df[["answer", "ground_truth", "answer_similarity"]],
+ "Scores (will not be uploaded to UI)": result_df[
+ ["answer", "ground_truth", "answer_similarity"]
+ ],
  "Aggregate Scores": [
  {
  "Mean Score": result_df["answer_similarity"].mean(),
@@ -18,7 +18,7 @@ from ragas.metrics.critique import (

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns

  aspect_map = {
  "coherence": coherence,
@@ -36,14 +36,14 @@ def AspectCritique(
  question_column="question",
  answer_column="answer",
  contexts_column="contexts",
- aspects: list = [
+ aspects: list = [ # noqa: B006 this is fine as immutable default since it never gets modified
  "coherence",
  "conciseness",
  "correctness",
  "harmfulness",
  "maliciousness",
  ],
- additional_aspects: list = [],
+ additional_aspects: list = None,
  ):
  """
  Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -131,13 +131,19 @@ def AspectCritique(
  df = get_renamed_columns(dataset.df, required_columns)

  built_in_aspects = [aspect_map[aspect] for aspect in aspects]
- custom_aspects = [
- _AspectCritique(name=name, definition=description)
- for name, description in additional_aspects
- ]
+ custom_aspects = (
+ [
+ _AspectCritique(name=name, definition=description)
+ for name, description in additional_aspects
+ ]
+ if additional_aspects
+ else []
+ )
  all_aspects = [*built_in_aspects, *custom_aspects]

- result_df = evaluate(Dataset.from_pandas(df), metrics=all_aspects).to_pandas()
+ result_df = evaluate(
+ Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
+ ).to_pandas()

  df_melted = result_df.melt(
  id_vars=["question", "answer", "contexts"],
@@ -11,7 +11,7 @@ from ragas.metrics import context_entity_recall

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm", "retrieval_performance")
@@ -99,8 +99,7 @@ def ContextEntityRecall(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[context_entity_recall],
+ Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(
@@ -110,7 +109,7 @@ def ContextEntityRecall(

  return (
  {
- "Scores": result_df[
+ "Scores (will not be uploaded to UI)": result_df[
  [
  "contexts",
  "ground_truth",
@@ -11,7 +11,7 @@ from ragas.metrics import context_precision

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm", "retrieval_performance")
@@ -21,7 +21,7 @@ def ContextPrecision(
  question_column: str = "question",
  contexts_column: str = "contexts",
  ground_truth_column: str = "ground_truth",
- ):
+ ): # noqa: B950
  """
  Context Precision is a metric that evaluates whether all of the ground-truth
  relevant items present in the contexts are ranked higher or not. Ideally all the
@@ -95,8 +95,7 @@ def ContextPrecision(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[context_precision],
+ Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["context_precision"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextPrecision(

  return (
  {
- "Scores": result_df[
+ "Scores (will not be uploaded to UI)": result_df[
  ["question", "contexts", "ground_truth", "context_precision"]
  ],
  "Aggregate Scores": [
@@ -11,7 +11,7 @@ from ragas.metrics import context_recall

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm", "retrieval_performance")
@@ -95,8 +95,7 @@ def ContextRecall(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[context_recall],
+ Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["context_recall"].to_list(), nbins=10)
@@ -104,7 +103,7 @@ def ContextRecall(

  return (
  {
- "Scores": result_df[
+ "Scores (will not be uploaded to UI)": result_df[
  ["question", "contexts", "ground_truth", "context_recall"]
  ],
  "Aggregate Scores": [
@@ -11,7 +11,7 @@ from ragas.metrics import context_relevancy

  from validmind import tags, tasks

- from .utils import get_renamed_columns
+ from .utils import get_ragas_config, get_renamed_columns


  @tags("ragas", "llm", "retrieval_performance")
@@ -88,8 +88,7 @@ def ContextRelevancy(
  df = get_renamed_columns(dataset.df, required_columns)

  result_df = evaluate(
- Dataset.from_pandas(df),
- metrics=[context_relevancy],
+ Dataset.from_pandas(df), metrics=[context_relevancy], **get_ragas_config()
  ).to_pandas()

  fig_histogram = px.histogram(x=result_df["context_relevancy"].to_list(), nbins=10)
@@ -97,7 +96,9 @@ def ContextRelevancy(

  return (
  {
- "Scores": result_df[["question", "contexts", "context_relevancy"]],
+ "Scores (will not be uploaded to UI)": result_df[
+ ["question", "contexts", "context_relevancy"]
+ ],
  "Aggregate Scores": [
  {
  "Mean Score": result_df["context_relevancy"].mean(),