validmind 2.0.0__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. validmind/__init__.py +4 -1
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +197 -0
  4. validmind/api_client.py +16 -4
  5. validmind/client.py +23 -3
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/nlp/__init__.py +5 -0
  8. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  9. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  10. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  11. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  12. validmind/errors.py +11 -1
  13. validmind/models/huggingface.py +2 -2
  14. validmind/models/pytorch.py +3 -3
  15. validmind/models/sklearn.py +4 -4
  16. validmind/tests/__init__.py +47 -9
  17. validmind/tests/data_validation/DatasetDescription.py +0 -1
  18. validmind/tests/data_validation/PiTCreditScoresHistogram.py +8 -3
  19. validmind/tests/data_validation/TargetRateBarPlots.py +3 -1
  20. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  21. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  22. validmind/tests/decorator.py +189 -0
  23. validmind/tests/model_validation/MeteorScore.py +92 -0
  24. validmind/tests/model_validation/RegardHistogram.py +5 -6
  25. validmind/tests/model_validation/RegardScore.py +3 -5
  26. validmind/tests/model_validation/RougeMetrics.py +6 -4
  27. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  28. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  29. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
  30. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
  31. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
  32. validmind/tests/model_validation/statsmodels/ADF.py +27 -1
  33. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  34. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +1 -13
  35. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  36. validmind/unit_metrics/__init__.py +0 -2
  37. validmind/unit_metrics/composite.py +275 -0
  38. validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
  39. validmind/unit_metrics/regression/HuberLoss.py +27 -0
  40. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
  41. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
  42. validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
  43. validmind/unit_metrics/regression/QuantileLoss.py +25 -0
  44. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
  45. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
  46. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
  47. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
  48. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
  49. validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
  50. validmind/unit_metrics/sklearn/classification/F1.py +2 -0
  51. validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
  52. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
  53. validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
  54. validmind/utils.py +17 -1
  55. validmind/vm_models/dataset.py +376 -21
  56. validmind/vm_models/figure.py +52 -17
  57. validmind/vm_models/test/metric.py +33 -30
  58. validmind/vm_models/test/output_template.py +0 -27
  59. validmind/vm_models/test/result_wrapper.py +57 -24
  60. validmind/vm_models/test/test.py +2 -1
  61. validmind/vm_models/test/threshold_test.py +24 -13
  62. validmind/vm_models/test_context.py +7 -0
  63. validmind/vm_models/test_suite/runner.py +1 -1
  64. validmind/vm_models/test_suite/test.py +1 -1
  65. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
  66. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/RECORD +69 -48
  67. validmind-2.0.7.dist-info/entry_points.txt +3 -0
  68. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
  69. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
@@ -59,21 +59,19 @@ class RegardScore(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -76,7 +76,6 @@ class RougeMetrics(Metric):
         if r_metrics is None:
             raise ValueError("rouge_metrics must be provided in params")
 
-        # With all
        if not (
            set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
            == set(r_metrics)
@@ -97,12 +96,13 @@ class RougeMetrics(Metric):
 
         metrics_df = pd.DataFrame(score_list)
         figures = []
+
         for m in metrics_df.columns:
             df_scores = pd.DataFrame(metrics_df[m].tolist())
             # Visualization part
             fig = go.Figure()
 
-            # Adding the line plots
+            # Adding the line plots for precision, recall, and F1-score with lines and markers
             fig.add_trace(
                 go.Scatter(
                     x=df_scores.index,
@@ -129,11 +129,13 @@ class RougeMetrics(Metric):
             )
 
             fig.update_layout(
-                title="ROUGE Scores for Each Row",
+                title=f"ROUGE Scores for {m}",
                 xaxis_title="Row Index",
                 yaxis_title="Score",
             )
-            k = m.replace("-", "")
+
+            # Ensure a unique key for each metric
+            k = f"{m.replace('-', '')}_{len(figures)}"
             figures.append(
                 Figure(
                     for_object=self,
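
Note on the new figure key: it concatenates the dash-stripped metric name with the running figure count, so each ROUGE variant gets a distinct Figure key. A tiny illustration of the values it produces (metric names here are just typical ROUGE identifiers, not necessarily the package defaults):

    figures = []
    for m in ["rouge-1", "rouge-2", "rouge-l"]:
        k = f"{m.replace('-', '')}_{len(figures)}"  # same expression as in the diff
        figures.append(k)
    print(figures)  # ['rouge1_0', 'rouge2_1', 'rougel_2']
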
@@ -0,0 +1,112 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import itertools
6
+ from dataclasses import dataclass
7
+
8
+ import pandas as pd
9
+ import plotly.graph_objects as go
10
+ import torch
11
+ from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
12
+ from tqdm import tqdm
13
+
14
+ from validmind.vm_models import Figure, Metric
15
+
16
+
17
+ @dataclass
18
+ class SelfCheckNLIScore(Metric):
19
+ """
20
+ Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
21
+
22
+ **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
23
+
24
+ **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
25
+
26
+ **Signs of High Risk**:
27
+ - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
28
+ - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
29
+ - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
30
+
31
+ **Strengths**:
32
+ - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
33
+ - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
34
+ - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
35
+
36
+ **Limitations**:
37
+ - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
38
+ - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
39
+ - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
40
+ """
41
+
42
+ name = "self_check_nli_score"
43
+ required_inputs = ["model", "dataset"]
44
+
45
+ def run(self):
46
+ # Assuming the dataset is structured with generated sentences and reference samples
47
+ y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
48
+ y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
49
+
50
+ hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
51
+
52
+ # Visualization of scores
53
+ figures = self.visualize_scores(hallucination_scores)
54
+
55
+ return self.cache_results(figures=figures)
56
+
57
+ def compute_hallucination_scores(self, predictions, references):
58
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
59
+ selfcheck_nli = SelfCheckNLI(device=device)
60
+ hallucination_scores = []
61
+
62
+ print("Starting hallucination score computation...")
63
+
64
+ for index, (sentences, samples) in enumerate(
65
+ tqdm(zip(predictions, references), total=len(predictions))
66
+ ):
67
+ sent_scores_nli = selfcheck_nli.predict(
68
+ sentences=sentences, sampled_passages=samples
69
+ )
70
+
71
+ # Compute the mean of the hallucination scores for this row
72
+ average_score = sent_scores_nli.mean()
73
+ hallucination_scores.append(average_score)
74
+
75
+ # Print a progress update for each row
76
+ print(
77
+ f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
78
+ )
79
+
80
+ print("Completed hallucination score computation.")
81
+
82
+ return hallucination_scores
83
+
84
+ def visualize_scores(self, scores):
85
+ scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
86
+
87
+ fig = go.Figure()
88
+ fig.add_trace(
89
+ go.Scatter(
90
+ x=scores_df.index,
91
+ y=scores_df["Hallucination Score"],
92
+ mode="lines+markers",
93
+ name="Hallucination Score",
94
+ )
95
+ )
96
+
97
+ fig.update_layout(
98
+ title="Hallucination Scores Across Text Instances",
99
+ xaxis_title="Text Instance Index",
100
+ yaxis_title="Hallucination Score",
101
+ )
102
+
103
+ # Wrapping the plotly figure for compatibility with your framework might be needed
104
+ figures = [
105
+ Figure(
106
+ for_object=self,
107
+ key=self.key,
108
+ figure=fig,
109
+ )
110
+ ]
111
+
112
+ return figures
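
The new metric scores each row by averaging per-sentence NLI contradiction scores. A minimal standalone sketch of that per-row step, using only the selfcheckgpt calls shown in the diff (the example sentences and passages below are made up):

    # Sketch of the per-row scoring performed by SelfCheckNLIScore.compute_hallucination_scores
    import torch
    from selfcheckgpt.modeling_selfcheck import SelfCheckNLI

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nli = SelfCheckNLI(device=device)

    # One hypothetical row: sentences generated by the model under test, plus
    # sampled reference passages for the same prompt.
    sentences = ["The Eiffel Tower is in Berlin.", "It was completed in 1889."]
    sampled_passages = [
        "The Eiffel Tower is a wrought-iron tower in Paris, completed in 1889.",
        "Gustave Eiffel's company built the tower for the 1889 World's Fair in Paris.",
    ]

    # predict() returns one contradiction-style score per sentence; the metric
    # averages these to get a single hallucination score for the row.
    sent_scores = nli.predict(sentences=sentences, sampled_passages=sampled_passages)
    print(sent_scores.mean())
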
@@ -59,30 +59,25 @@ class DescriptiveAnalytics(Metric):
     }
 
     def run(self):
-        mean = np.mean(self.inputs.dataset.y_pred(self.inputs.model.input_id))
-        median = np.median(self.inputs.dataset.y_pred(self.inputs.model.input_id))
-        std = np.std(self.inputs.dataset.y_pred(self.inputs.model.input_id))
+        # Assuming y_pred returns a 2D array of embeddings [samples, features]
+        preds = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+
+        # Calculate statistics across the embedding dimensions, not across all embeddings
+        means = np.mean(preds, axis=0)  # Mean of each feature across all samples
+        medians = np.median(preds, axis=0)  # Median of each feature across all samples
+        stds = np.std(preds, axis=0)  # Std. dev. of each feature across all samples
+
+        # Plot histograms of the calculated statistics
+        mean_fig = px.histogram(x=means, title="Distribution of Embedding Means")
+        median_fig = px.histogram(x=medians, title="Distribution of Embedding Medians")
+        std_fig = px.histogram(
+            x=stds, title="Distribution of Embedding Standard Deviations"
+        )
 
         return self.cache_results(
             figures=[
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(mean, title="Distribution of Embedding Means"),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        median, title="Distribution of Embedding Medians"
-                    ),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        std, title="Distribution of Embedding Standard Deviations"
-                    ),
-                ),
+                Figure(for_object=self, key=f"{self.key}_mean", figure=mean_fig),
+                Figure(for_object=self, key=f"{self.key}_median", figure=median_fig),
+                Figure(for_object=self, key=f"{self.key}_std", figure=std_fig),
             ],
         )
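
The substantive change here is that the statistics are now computed per embedding dimension (axis=0) rather than over the flattened prediction array, so each histogram shows a distribution of per-dimension values instead of a single scalar. A small standalone illustration of that axis behaviour, with made-up embeddings:

    import numpy as np

    # Hypothetical batch of 4 embeddings with 3 dimensions each
    preds = np.array(
        [
            [0.1, 0.5, -0.2],
            [0.0, 0.4, -0.1],
            [0.2, 0.6, -0.3],
            [0.1, 0.5, -0.2],
        ]
    )

    print(np.mean(preds))          # old behaviour: one scalar over all values
    print(np.mean(preds, axis=0))  # new behaviour: one mean per dimension -> shape (3,)
    print(np.std(preds, axis=0))   # likewise, one standard deviation per dimension
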
@@ -131,7 +131,9 @@ class ClassifierPerformance(Metric):
         y_true = self.y_true()
         class_pred = self.y_pred()
 
-        report = metrics.classification_report(y_true, class_pred, output_dict=True)
+        report = metrics.classification_report(
+            y_true, class_pred, output_dict=True, zero_division=0
+        )
         report["roc_auc"] = multiclass_roc_auc_score(y_true, class_pred)
 
         return self.cache_results(report)
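
The added zero_division=0 only changes how scikit-learn reports classes with no predicted (or no true) samples: the undefined precision/recall/F1 entries are set to 0.0 without raising an UndefinedMetricWarning. A quick standalone illustration with a toy label set:

    from sklearn import metrics

    y_true = [0, 0, 1, 1]
    y_pred = [0, 0, 0, 0]  # class 1 is never predicted, so its precision is undefined

    # With zero_division=0 the undefined precision is reported as 0.0 silently;
    # the default, zero_division="warn", also returns 0.0 but emits a warning.
    report = metrics.classification_report(
        y_true, y_pred, output_dict=True, zero_division=0
    )
    print(report["1"]["precision"])  # 0.0
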
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import shap
 
+from validmind.errors import UnsupportedModelForSHAPError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric
 
@@ -72,6 +73,9 @@ class SHAPGlobalImportance(Metric):
             "visualization",
         ],
     }
+    default_params = {
+        "kernel_explainer_samples": 10,
+    }
 
     def _generate_shap_plot(self, type_, shap_values, x_test):
         """
@@ -127,22 +131,44 @@ class SHAPGlobalImportance(Metric):
             model_class == "XGBClassifier"
             or model_class == "RandomForestClassifier"
            or model_class == "CatBoostClassifier"
+            or model_class == "DecisionTreeClassifier"
         ):
             explainer = shap.TreeExplainer(trained_model)
         elif (
             model_class == "LogisticRegression"
             or model_class == "XGBRegressor"
             or model_class == "LinearRegression"
+            or model_class == "LinearSVC"
         ):
             explainer = shap.LinearExplainer(trained_model, self.inputs.dataset.x)
+        elif model_class == "SVC":
+            # KernelExplainer is slow so we use shap.sample to speed it up
+            explainer = shap.KernelExplainer(
+                trained_model.predict,
+                shap.sample(
+                    self.inputs.dataset.x,
+                    self.params["kernel_explainer_samples"],
+                ),
+            )
+        else:
+            raise UnsupportedModelForSHAPError(
+                f"Model {model_class} not supported for SHAP importance."
+            )
+
+        # KernelExplainer is slow so we use shap.sample to speed it up
+        if isinstance(explainer, shap.KernelExplainer):
+            shap_sample = shap.sample(
+                self.inputs.dataset.x,
+                self.params["kernel_explainer_samples"],
+            )
         else:
-            raise ValueError(f"Model {model_class} not supported for SHAP importance.")
+            shap_sample = self.inputs.dataset.x
 
-        shap_values = explainer.shap_values(self.inputs.dataset.x)
+        shap_values = explainer.shap_values(shap_sample)
 
         figures = [
-            self._generate_shap_plot("mean", shap_values, self.inputs.dataset.x),
-            self._generate_shap_plot("summary", shap_values, self.inputs.dataset.x),
+            self._generate_shap_plot("mean", shap_values, shap_sample),
+            self._generate_shap_plot("summary", shap_values, shap_sample),
         ]
 
         # restore warnings
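
For kernel-based explanation the change subsamples both the background data and the rows being explained to keep runtime manageable. A rough standalone sketch of the same pattern (the sample size of 10 mirrors the new kernel_explainer_samples default; the model and data below are purely illustrative):

    import shap
    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    model = SVC().fit(X, y)

    # KernelExplainer is model-agnostic but slow, so the background data and the
    # rows being explained are both subsampled with shap.sample, as in the diff.
    background = shap.sample(X, 10)
    explainer = shap.KernelExplainer(model.predict, background)

    shap_sample = shap.sample(X, 10)
    shap_values = explainer.shap_values(shap_sample)
    # shap_values has one row per sampled instance and one column per feature
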
@@ -129,12 +129,12 @@ class TrainingTestDegradation(ThresholdTest):
         y_test_true = y_test_true.astype(y_test_pred.dtype)
 
         report_train = metrics.classification_report(
-            y_train_true, y_train_pred, output_dict=True
+            y_train_true, y_train_pred, output_dict=True, zero_division=0
         )
         report_train["roc_auc"] = multiclass_roc_auc_score(y_train_true, y_train_pred)
 
         report_test = metrics.classification_report(
-            y_test_true, y_test_pred, output_dict=True
+            y_test_true, y_test_pred, output_dict=True, zero_division=0
         )
         report_test["roc_auc"] = multiclass_roc_auc_score(y_test_true, y_test_pred)
 
@@ -145,7 +145,13 @@ class TrainingTestDegradation(ThresholdTest):
             for metric_name in ["precision", "recall", "f1-score"]:
                 train_score = report_train[class_name][metric_name]
                 test_score = report_test[class_name][metric_name]
-                degradation = (train_score - test_score) / train_score
+
+                # If training score is 0, degradation is assumed to be 100%
+                if train_score == 0:
+                    degradation = 1.0
+                else:
+                    degradation = (train_score - test_score) / train_score
+
                 passed = degradation < self.params["max_threshold"]
                 test_results.append(
                     ThresholdTestResult(
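
Relative degradation is still (train − test) / train; the new branch only guards the division when a class scores exactly zero on the training set. A small worked sketch, using a hypothetical max_threshold of 0.10 (not necessarily the test's default):

    def degradation(train_score, test_score):
        # Mirrors the updated logic: a zero training score counts as 100% degradation.
        if train_score == 0:
            return 1.0
        return (train_score - test_score) / train_score

    max_threshold = 0.10  # illustrative value

    print(degradation(0.90, 0.85) < max_threshold)  # ~0.056 degradation -> passes
    print(degradation(0.90, 0.72) < max_threshold)  # 0.20 degradation -> fails
    print(degradation(0.00, 0.10) < max_threshold)  # treated as 1.0 -> fails instead of ZeroDivisionError
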
@@ -2,9 +2,10 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+from pandas import DataFrame
 from statsmodels.tsa.stattools import adfuller
 
-from validmind.vm_models import Metric
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
 
 class ADF(Metric):
@@ -51,6 +52,31 @@ class ADF(Metric):
         ],
     }
 
+    def summary(self, metric_value: dict):
+        table = DataFrame.from_dict(metric_value, orient="index")
+        table = table.reset_index()
+        table.columns = [
+            "Feature",
+            "ADF Statistic",
+            "P-Value",
+            "Used Lag",
+            "Number of Observations",
+            "Critical Values",
+            "IC Best",
+        ]
+        table = table.rename_axis("Index", axis=1)
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=table,
+                    metadata=ResultTableMetadata(
+                        title="ADF Test Results for Each Feature"
+                    ),
+                ),
+            ]
+        )
+
     def run(self):
         """
         Calculates ADF metric for each of the dataset features
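
The new summary() method assumes each cached value is the tuple returned by statsmodels' adfuller, which is why the table has exactly seven columns: the feature name plus the six tuple elements. A quick standalone check of that tuple shape (the random-walk series below is illustrative):

    import numpy as np
    from statsmodels.tsa.stattools import adfuller

    # Illustrative series: a random walk, non-stationary by construction
    rng = np.random.default_rng(0)
    series = np.cumsum(rng.normal(size=200))

    # adfuller returns (ADF statistic, p-value, used lag, number of observations,
    # critical values dict, best information criterion), matching the table
    # columns once the feature name is prepended via reset_index().
    adf_stat, p_value, used_lag, n_obs, critical_values, ic_best = adfuller(series)
    print(adf_stat, p_value, used_lag, n_obs, critical_values, ic_best)
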
@@ -79,7 +79,7 @@ class RegressionModelsPerformance(Metric):
     def sample_performance_ols(self, models, datasets):
         evaluation_results = []
 
-        for (model, dataset) in zip(models, datasets):
+        for model, dataset in zip(models, datasets):
             X_columns = dataset.get_features_columns()
             y_true = dataset.y
             y_pred = dataset.y_pred(model.input_id)
@@ -73,18 +73,6 @@ class ResidualsVisualInspection(Metric):
         "tags": ["statsmodels", "visualization"],
     }
 
-    def get_residuals(self, column, series):
-        """
-        Get the seasonal decomposition residuals from the test
-        context or re-compute them if not available. This allows
-        running the test individually or as part of a test suite.
-        """
-        sd_all_columns = self.test_context.get_context_data("seasonal_decompose")
-        if sd_all_columns is None or column not in sd_all_columns:
-            return seasonal_decompose(series, model="additive")
-
-        return sd_all_columns[column]
-
     @staticmethod
     def residual_analysis(residuals, variable_name, axes):
         residuals = residuals.dropna().reset_index(
@@ -115,7 +103,7 @@ class ResidualsVisualInspection(Metric):
 
         # TODO: specify which columns to plot via params
         for col in x_train.columns:
-            sd = self.get_residuals(col, x_train[col])
+            sd = seasonal_decompose(x_train[col], model="additive")
 
             # Remove NaN values from the residuals and reset the index
             residuals = pd.Series(sd.resid).dropna().reset_index(drop=True)
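
The test now recomputes the additive decomposition inline for each column instead of fetching it from the shared test context. A standalone sketch of that call and the same residual cleanup, on a synthetic monthly series (the data and frequency here are made up):

    import numpy as np
    import pandas as pd
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Synthetic monthly series: trend + yearly seasonality + noise
    idx = pd.date_range("2020-01-01", periods=48, freq="MS")
    values = np.arange(48) * 0.5 + 10 * np.sin(np.arange(48) * 2 * np.pi / 12)
    series = pd.Series(values + np.random.default_rng(0).normal(size=48), index=idx)

    # Period is inferred from the DatetimeIndex frequency, as in the test's columns
    sd = seasonal_decompose(series, model="additive")
    residuals = pd.Series(sd.resid).dropna().reset_index(drop=True)  # same cleanup as the test
    print(residuals.head())
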
@@ -57,6 +57,8 @@ class AIPoweredTest:
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": user_prompt},
             ],
+            temperature=0.0,
+            seed=42,
         )
         .choices[0]
         .message.content
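
Setting temperature=0.0 and a fixed seed makes the LLM-backed prompt tests as repeatable as the API allows (OpenAI documents seed as best-effort determinism, not a guarantee). A hedged sketch of an equivalent call outside the framework, assuming the openai>=1.x client and an illustrative model name:

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    response = client.chat.completions.create(
        model="gpt-4",  # illustrative; the framework's configured model may differ
        messages=[
            {"role": "system", "content": "You are a prompt evaluation assistant."},
            {"role": "user", "content": "Score this prompt from 1 to 10: ..."},
        ],
        temperature=0.0,  # greedy decoding: removes sampling randomness
        seed=42,          # best-effort reproducibility across identical requests
    )
    print(response.choices[0].message.content)
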
@@ -237,8 +237,6 @@ def run_metric(metric_id=None, inputs=None, params=None):
     # Run the metric
     result = metric.run()
 
-    cache_key = get_metric_cache_key(metric_id, params, inputs)
-
     unit_metric_results_cache[cache_key] = result
 
     return result