validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff compares the contents of two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
Files changed (102)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +74 -82
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +58 -19
  5. validmind/client.py +5 -5
  6. validmind/models/foundation.py +10 -6
  7. validmind/models/function.py +3 -1
  8. validmind/models/metadata.py +1 -1
  9. validmind/test_suites/__init__.py +1 -7
  10. validmind/test_suites/regression.py +0 -16
  11. validmind/test_suites/statsmodels_timeseries.py +1 -1
  12. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  13. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  14. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  15. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  16. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  17. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  18. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  19. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  20. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  21. validmind/tests/data_validation/ScatterPlot.py +1 -1
  22. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  23. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  24. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  25. validmind/tests/data_validation/WOEBinTable.py +1 -1
  26. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  27. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  28. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  29. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  30. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  31. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  32. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  33. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  34. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  35. validmind/tests/decorator.py +1 -1
  36. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  37. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  38. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  39. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  41. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  42. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  44. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  45. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  46. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  47. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  48. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  49. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  50. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  51. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  52. validmind/tests/model_validation/ragas/utils.py +35 -9
  53. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  54. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  55. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  56. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  57. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  58. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  59. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  60. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  61. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  62. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  63. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  64. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  65. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  66. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  67. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  68. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  69. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  70. validmind/tests/prompt_validation/Bias.py +14 -11
  71. validmind/tests/prompt_validation/Clarity.py +14 -11
  72. validmind/tests/prompt_validation/Conciseness.py +14 -11
  73. validmind/tests/prompt_validation/Delimitation.py +14 -11
  74. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  75. validmind/tests/prompt_validation/Robustness.py +11 -11
  76. validmind/tests/prompt_validation/Specificity.py +14 -11
  77. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  78. validmind/unit_metrics/composite.py +2 -1
  79. validmind/utils.py +4 -63
  80. validmind/vm_models/dataset/dataset.py +17 -3
  81. validmind/vm_models/dataset/utils.py +2 -2
  82. validmind/vm_models/model.py +1 -1
  83. validmind/vm_models/test/metric.py +1 -8
  84. validmind/vm_models/test/result_wrapper.py +2 -2
  85. validmind/vm_models/test/test.py +3 -0
  86. validmind/vm_models/test/threshold_test.py +1 -1
  87. validmind/vm_models/test_suite/runner.py +7 -4
  88. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  89. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
  90. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  91. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  92. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  93. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  94. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  95. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  96. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  97. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  98. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  99. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  100. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  101. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  102. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/ragas/Faithfulness.py
@@ -11,7 +11,7 @@ from ragas.metrics import faithfulness

 from validmind import tags, tasks

-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns


 @tags("ragas", "llm", "rag_performance")
@@ -20,7 +20,7 @@ def Faithfulness(
     dataset,
     answer_column="answer",
     contexts_column="contexts",
-):
+):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.

@@ -93,8 +93,7 @@ def Faithfulness(
     df = get_renamed_columns(dataset.df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[faithfulness],
+        Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
     ).to_pandas()

     fig_histogram = px.histogram(x=result_df["faithfulness"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def Faithfulness(

     return (
         {
-            "Scores": result_df[["contexts", "answer", "faithfulness"]],
+            "Scores (will not be uploaded to UI)": result_df[
+                ["contexts", "answer", "faithfulness"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),

validmind/tests/model_validation/ragas/utils.py
@@ -2,17 +2,42 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+import os

-def _udf_get_sub_col(x, root_col, sub_col):
-    if not isinstance(x, dict):
-        raise TypeError(f"Expected a dictionary in column '{root_col}', got {type(x)}.")
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings

-    if sub_col not in x:
-        raise KeyError(
-            f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
-        )
+from validmind.ai.utils import get_client_and_model

-    return x[sub_col]
+EMBEDDINGS_MODEL = "text-embedding-3-small"
+
+
+def get_ragas_config():
+    client, model = get_client_and_model()
+    os.environ["OPENAI_API_BASE"] = str(client.base_url)
+
+    return {
+        "llm": ChatOpenAI(api_key=client.api_key, model=model),
+        "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
+    }
+
+
+def make_sub_col_udf(root_col, sub_col):
+    """Create a udf that extracts sub-column values from a dictionary."""
+
+    def _udf_get_sub_col(x):
+        if not isinstance(x, dict):
+            raise TypeError(
+                f"Expected a dictionary in column '{root_col}', got {type(x)}."
+            )
+
+        if sub_col not in x:
+            raise KeyError(
+                f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
+            )
+
+        return x[sub_col]
+
+    return _udf_get_sub_col


 def get_renamed_columns(df, column_map):
@@ -34,6 +59,7 @@ def get_renamed_columns(df, column_map):
     Returns:
         pd.DataFrame: The DataFrame with columns renamed.
     """
+
     new_df = df.copy()

     for new_name, source in column_map.items():
@@ -50,7 +76,7 @@ def get_renamed_columns(df, column_map):

         if root_col in new_df.columns:
             new_df[new_name] = new_df[root_col].apply(
-                lambda x: _udf_get_sub_col(x, root_col, sub_col)
+                make_sub_col_udf(root_col, sub_col)
             )

         else:
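
The new get_ragas_config() helper routes every ragas evaluate() call through the LLM and embeddings configured for ValidMind (ChatOpenAI plus OpenAIEmbeddings), while make_sub_col_udf replaces the old three-argument _udf_get_sub_col with a closure that can be handed straight to pandas .apply(). A minimal, self-contained sketch of the closure pattern; the toy DataFrame and column names below are illustrative, not from the package:

    import pandas as pd

    def make_sub_col_udf(root_col, sub_col):
        """Return a one-argument extractor bound to root_col/sub_col (mirrors the new helper)."""

        def _udf_get_sub_col(x):
            if not isinstance(x, dict):
                raise TypeError(f"Expected a dictionary in column '{root_col}', got {type(x)}.")
            if sub_col not in x:
                raise KeyError(f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'.")
            return x[sub_col]

        return _udf_get_sub_col

    df = pd.DataFrame({"meta": [{"answer": "yes"}, {"answer": "no"}]})
    # The closure carries root_col/sub_col, so .apply() only passes the cell value.
    df["answer"] = df["meta"].apply(make_sub_col_udf("meta", "answer"))
    print(df["answer"].tolist())  # ['yes', 'no']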

validmind/tests/model_validation/sklearn/ClusterPerformance.py
@@ -66,7 +66,7 @@ class ClusterPerformance(Metric):
         y_true_test = y_true_test.astype(y_pred_test.dtype).flatten()
         results = []
         for metric_name, metric_fcn in metric_info.items():
-            for sample in samples:
+            for _ in samples:
                 train_value = metric_fcn(list(y_true_train), y_pred_train)
                 test_value = metric_fcn(list(y_true_test), y_pred_test)
                 results.append(
@@ -85,7 +85,7 @@ class ClusterPerformance(Metric):
         """
         table_records = []
         for result in raw_results:
-            for key, value in result.items():
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,

validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py
@@ -123,7 +123,7 @@ class ClusterPerformanceMetrics(ClusterPerformance):
         """
         table_records = []
         for result in raw_results:
-            for key, value in result.items():
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,

validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py
@@ -52,7 +52,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
     """

     name = "models_performance_comparison"
-    required_inputs = ["model", "models", "dataset"]
+    required_inputs = ["dataset", "models"]
     metadata = {
         "task_types": ["classification", "text_classification"],
         "tags": [
@@ -70,12 +70,12 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         """
         results = []
         prf_table = []
-        classes = {str(i) for i in unique(self.y_true())}
+        classes = {str(i) for i in unique(self.inputs.dataset.y)}

         for class_name in classes:
             prf_dict = {}
             prf_dict["Class"] = class_name
-            for m, m_v in metric_value.items():
+            for m, _ in metric_value.items():
                 prf_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 prf_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 prf_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -85,7 +85,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for class_name in avg_metrics:
             avg_dict = {}
             avg_dict["Class"] = class_name
-            for m, m_v in metric_value.items():
+            for m, _ in metric_value.items():
                 avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -103,7 +103,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for metric_name in ["accuracy", "roc_auc"]:
             acc_roc_auc_dict = {}
             acc_roc_auc_dict["Metric"] = metric_name
-            for m, m_v in metric_value.items():
+            for m, _ in metric_value.items():
                 acc_roc_auc_dict[f"accuracy- {m}"] = metric_value[m]["accuracy"]
                 acc_roc_auc_dict[f"roc_auc- {m}"] = metric_value[m]["roc_auc"]
             acc_roc_auc_table.append(acc_roc_auc_dict)
@@ -122,10 +122,8 @@ class ModelsPerformanceComparison(ClassifierPerformance):
                 "List of models must be provided as a `models` parameter to compare performance"
             )

-        all_models = [self.inputs.model]
+        all_models = self.inputs.models

-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
         results = {}
         for idx, model in enumerate(all_models):
             y_true = self.inputs.dataset.y
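
With required_inputs now ["dataset", "models"], the comparison no longer merges a separate model input into the candidate list; every model to compare is passed through models. A sketch of the updated invocation, assuming the library's vm.tests.run_test entry point and this test's dotted ID; vm_ds, model_a and model_b are placeholder VMDataset/VMModel objects created earlier with vm.init_dataset / vm.init_model:

    import validmind as vm

    result = vm.tests.run_test(
        "validmind.model_validation.sklearn.ModelsPerformanceComparison",
        inputs={
            "dataset": vm_ds,              # single evaluation dataset
            "models": [model_a, model_b],  # all candidates go in `models` now
        },
    )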

validmind/tests/model_validation/sklearn/RegressionErrors.py
@@ -57,7 +57,7 @@ class RegressionErrors(Metric):
         """
         table_records = []
         for result in raw_results:
-            for key, value in result.items():
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,

validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py
@@ -9,8 +9,11 @@ import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error

 from validmind.errors import SkipTestError
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata

+logger = get_logger(__name__)
+

 @dataclass
 class RegressionModelsPerformanceComparison(Metric):
@@ -56,7 +59,7 @@ class RegressionModelsPerformanceComparison(Metric):
     """

     name = "models_performance_comparison"
-    required_inputs = ["model", "dataset"]
+    required_inputs = ["dataset", "models"]

     metadata = {
         "task_types": ["regression"],
@@ -76,8 +79,14 @@ class RegressionModelsPerformanceComparison(Metric):
         results["Mean Squared Error (MSE)"] = mse_test
         results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)

-        mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
-        results["Mean Absolute Percentage Error (MAPE)"] = mape_test
+        if np.any(y_true_test == 0):
+            logger.warning(
+                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
+            )
+            results["Mean Absolute Percentage Error (MAPE)"] = None
+        else:
+            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
+            results["Mean Absolute Percentage Error (MAPE)"] = mape_test

         mbd_test = np.mean(y_pred_test - y_true_test)
         results["Mean Bias Deviation (MBD)"] = mbd_test
@@ -94,7 +103,7 @@ class RegressionModelsPerformanceComparison(Metric):
         for metric_name in metrics:
             errors_dict = {}
             errors_dict["Errors"] = metric_name
-            for m, m_v in metric_value.items():
+            for m, _ in metric_value.items():
                 for metric in metrics:
                     res = re.findall(r"\(.*?\)", metric)
                     res[0][1:-1]
@@ -117,10 +126,7 @@ class RegressionModelsPerformanceComparison(Metric):
                 "List of models must be provided as a `models` parameter to compare performance"
             )

-        all_models = [self.inputs.model]
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
+        all_models = self.inputs.models

         results = {}

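
The MAPE change above skips the metric instead of dividing by zero whenever a true value is 0. An equivalent standalone helper for reference; safe_mape is illustrative, not part of the package:

    import numpy as np

    def safe_mape(y_true, y_pred):
        """Return MAPE in percent, or None when a zero true value would make it undefined."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if np.any(y_true == 0):
            return None  # same behaviour as the new guard: report None instead of inf/nan
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print(safe_mape([100, 200], [110, 190]))  # 7.5
    print(safe_mape([0, 200], [10, 190]))     # None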

validmind/tests/model_validation/sklearn/RegressionR2Square.py
@@ -57,7 +57,7 @@ class RegressionR2Square(Metric):
         """
         table_records = []
         for result in raw_results:
-            for key, value in result.items():
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,

validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py
@@ -53,7 +53,7 @@ class DurbinWatsonTest(Metric):
         """
         Calculates DB for each of the dataset features
         """
-        x_train = self.train_ds.df
+        x_train = self.inputs.dataset.df
         dw_values = {}
         for col in x_train.columns:
             dw_values[col] = durbin_watson(x_train[col].values)

validmind/tests/model_validation/statsmodels/GINITable.py
@@ -80,7 +80,7 @@ class GINITable(Metric):
         metrics_dict = {"Dataset": [], "AUC": [], "GINI": [], "KS": []}

         # Iterate over each dataset in the inputs
-        for i, dataset in enumerate(self.inputs.datasets):
+        for _, dataset in enumerate(self.inputs.datasets):
             dataset_label = (
                 dataset.input_id
             )  # Use input_id as the label for each dataset

validmind/tests/model_validation/statsmodels/JarqueBera.py
@@ -59,7 +59,7 @@ class JarqueBera(Metric):
         """
         Calculates JB for each of the dataset features
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]

         jb_values = {}
         for col in x_train.columns:

validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py
@@ -87,7 +87,7 @@ class KolmogorovSmirnov(Metric):
         if data_distribution not in ["norm" or "exp"]:
             InvalidTestParametersError("Dist parameter must be either 'norm' or 'exp'")

-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         ks_values = {}
         for col in x_train.columns:
             ks_stat, p_value = kstest_normal(x_train[col].values, data_distribution)

validmind/tests/model_validation/statsmodels/LJungBox.py
@@ -54,7 +54,7 @@ class LJungBox(Metric):
         """
         Calculates Ljung-Box test for each of the dataset features
         """
-        x_train = self.train_ds.df
+        x_train = self.inputs.dataset.df

         ljung_box_values = {}
         for col in x_train.columns:

validmind/tests/model_validation/statsmodels/Lilliefors.py
@@ -70,7 +70,7 @@ class Lilliefors(Metric):
         """
         Calculates Lilliefors test for each of the dataset features
         """
-        x_train = self.train_ds.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]

         lilliefors_values = {}
         for col in x_train.columns:

validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py
@@ -8,6 +8,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from scipy import stats

+from validmind.errors import SkipTestError
 from validmind.vm_models import Figure, Metric


@@ -115,6 +116,9 @@ class RegressionCoeffsPlot(Metric):
            all_models.extend(self.inputs.models)

        for i, model in enumerate(all_models):
+           if model.library != "statsmodels":
+               raise SkipTestError("Only statsmodels are supported for this metric")
+
            model_name = f"Model {i+1}"

            fig, metric_values = self.plot_coefficients_with_ci(model, model_name)

validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py
@@ -7,6 +7,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import seaborn as sns

+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric

@@ -82,10 +83,14 @@ class RegressionFeatureSignificance(Metric):
         # Initialize a list to store figures
         figures = []

-        for i, fitted_model in enumerate(model_list):
+        for i, model in enumerate(model_list):
+
+            if model.library != "statsmodels":
+                raise SkipTestError("Only statsmodels are supported for this metric")
+
             # Get the coefficients and p-values from the model
-            coefficients = fitted_model.model.params
-            pvalues = fitted_model.model.pvalues
+            coefficients = model.model.params
+            pvalues = model.model.pvalues

             # Sort the variables by p-value in ascending order
             sorted_idx = pvalues.argsort()
@@ -122,7 +127,7 @@ class RegressionFeatureSignificance(Metric):
                     for_object=self,
                     key=f"{self.key}:{i}",
                     figure=fig,
-                    metadata={"model": str(fitted_model.model)},
+                    metadata={"model": str(model.model)},
                 )
             )
             plt.close("all")

validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py
@@ -73,9 +73,9 @@ class RegressionModelsCoeffs(Metric):
             raise ValueError("List of models must be provided in the models parameter")

         for model in self.inputs.models:
-            if model.class_ != "statsmodels" and model.class_ != "R":
+            if model.library != "statsmodels":
                 raise SkipTestError(
-                    "Only statsmodels and R models are supported for this metric"
+                    "Only statsmodels models are supported for this metric"
                 )

         coefficients = [m.regression_coefficients() for m in self.inputs.models]

validmind/tests/model_validation/statsmodels/RunsTest.py
@@ -59,7 +59,7 @@ class RunsTest(Metric):
         """
         Calculates the run test for each of the dataset features
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]

         runs_test_values = {}
         for col in x_train.columns:

validmind/tests/model_validation/statsmodels/ShapiroWilk.py
@@ -53,7 +53,7 @@ class ShapiroWilk(Metric):
         """
         Calculates Shapiro-Wilk test for each of the dataset features.
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         sw_values = {}
         for col in x_train.columns:
             sw_stat, sw_pvalue = stats.shapiro(x_train[col].values)
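
JarqueBera, KolmogorovSmirnov, Lilliefors, RunsTest and ShapiroWilk now operate only on the dataset's numeric feature columns, which keeps the per-column statistics from failing on categorical or text features. A rough standalone equivalent of that filtering, using pandas select_dtypes as a stand-in for VMDataset.feature_columns_numeric; the sample data is illustrative:

    import pandas as pd
    from scipy import stats

    df = pd.DataFrame(
        {
            "income": [52.1, 48.7, 61.3, 55.0, 49.9],
            "region": ["north", "south", "east", "west", "north"],  # non-numeric, skipped
        }
    )

    numeric_df = df.select_dtypes(include="number")  # ~ dataset.feature_columns_numeric

    for col in numeric_df.columns:
        stat, pvalue = stats.shapiro(numeric_df[col].values)
        print(f"{col}: statistic={stat:.4f}, p-value={pvalue:.4f}")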

validmind/tests/prompt_validation/Bias.py
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Bias(ThresholdTest, AIPoweredTest):
+class Bias(ThresholdTest):
     """
     Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.

@@ -103,12 +109,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -132,14 +132,17 @@ Prompt:
         )

     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
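
Bias (and the other prompt-validation tests below) now import module-level helpers instead of mixing in AIPoweredTest, and they raise MissingRequiredTestInputError when the model input carries no prompt. From the caller's side this means the model must be initialized with a prompt template. The sketch below assumes the FoundationModel/Prompt helpers from validmind/models/foundation.py shown in the file list and the usual vm.init_model entry point; call_llm is a hypothetical predict function:

    import validmind as vm
    from validmind.models.foundation import FoundationModel, Prompt

    def call_llm(prompt_input: str) -> str:
        """Hypothetical predict function that forwards the rendered prompt to an LLM."""
        ...

    # The prompt-validation tests read self.inputs.model.prompt.template, so the
    # model input needs a Prompt attached; without it they now fail fast with
    # MissingRequiredTestInputError instead of an AttributeError mid-run.
    vm_model = vm.init_model(
        model=FoundationModel(
            predict_fn=call_llm,
            prompt=Prompt(
                template="Classify the sentiment of the following text: {input}",
                variables=["input"],
            ),
        ),
        input_id="sentiment_llm",
    )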

validmind/tests/prompt_validation/Clarity.py
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Clarity(ThresholdTest, AIPoweredTest):
+class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -93,12 +99,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -122,14 +122,17 @@ Prompt:
         )

     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [

validmind/tests/prompt_validation/Conciseness.py
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Conciseness(ThresholdTest, AIPoweredTest):
+class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -95,12 +101,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -124,14 +124,17 @@ Prompt:
         )

     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [

validmind/tests/prompt_validation/Delimitation.py
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Delimitation(ThresholdTest, AIPoweredTest):
+class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -85,12 +91,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -114,14 +114,17 @@ Prompt:
         )

     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [