validmind 2.5.15__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +54 -112
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/tests/__types__.py +19 -10
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +20 -24
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +4 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/JarqueBera.py +22 -30
- validmind/tests/{model_validation/statsmodels → data_validation}/LJungBox.py +23 -27
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/{model_validation/statsmodels → data_validation}/RunsTest.py +17 -20
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +20 -22
- validmind/tests/data_validation/nlp/Hashtags.py +15 -20
- validmind/tests/data_validation/nlp/TextDescription.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +5 -6
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/sklearn/FeatureImportance.py +3 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +59 -0
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +40 -20
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +0 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +50 -26
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/RECORD +43 -30
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.15.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0

validmind/tests/data_validation/ProtectedClassesDisparity.py
@@ -0,0 +1,133 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import io
+import sys
+
+import aequitas.plot as ap
+import pandas as pd
+from aequitas.bias import Bias
+from aequitas.group import Group
+from aequitas.plotting import Plot
+
+from validmind import tags, tasks
+from validmind.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+@tags("bias_and_fairness")
+@tasks("classification", "regression")
+def ProtectedClassesDisparity(
+    dataset,
+    model,
+    protected_classes=None,
+    disparity_tolerance=1.25,
+    metrics=["fnr", "fpr", "tpr"],
+):
+    """
+    Investigates disparities in model performance across different protected class segments.
+
+    ### Purpose
+
+    This test aims to identify and quantify potential biases in model outcomes by comparing various performance metrics
+    across different segments of protected classes. It helps in assessing whether the model produces discriminatory
+    outcomes for certain groups, which is crucial for ensuring fairness in machine learning models.
+
+    ### Test Mechanism
+
+    The test performs the following steps:
+    1. Calculates performance metrics (e.g., false negative rate, false positive rate, true positive rate) for each segment
+    of the specified protected classes.
+    2. Computes disparity ratios by comparing these metrics between different segments and a reference group.
+    3. Generates visualizations showing the disparities and their relation to a user-defined disparity tolerance threshold.
+    4. Produces a comprehensive table with various disparity metrics for detailed analysis.
+
+    ### Signs of High Risk
+
+    - Disparity ratios exceeding the specified disparity tolerance threshold.
+    - Consistent patterns of higher error rates or lower performance for specific protected class segments.
+    - Statistically significant differences in performance metrics across segments.
+
+    ### Strengths
+
+    - Provides a comprehensive view of model fairness across multiple protected attributes and metrics.
+    - Allows for easy identification of problematic disparities through visual and tabular representations.
+    - Customizable disparity tolerance threshold to align with specific use-case requirements.
+    - Applicable to various performance metrics, offering a multi-faceted analysis of model fairness.
+
+    ### Limitations
+
+    - Relies on a predefined reference group for each protected class, which may not always be the most appropriate choice.
+    - Does not account for intersectionality between different protected attributes.
+    - The interpretation of results may require domain expertise to understand the implications of observed disparities.
+    """
+
+    if protected_classes is None:
+        logger.warning(
+            "No protected classes provided. Please pass the 'protected_classes' parameter to run this test."
+        )
+        return pd.DataFrame()
+
+    if sys.version_info < (3, 9):
+        raise RuntimeError("This test requires Python 3.9 or higher.")
+
+    df = dataset._df
+
+    for protected_class in protected_classes:
+        # make the dataset compatible for the python package of interest
+        df[protected_class] = pd.Categorical(df[protected_class]).astype("object")
+
+    df["score"] = dataset.y_pred(model).astype(int)
+    df["label_value"] = df[dataset.target_column].astype(int)
+
+    # let map the attributes for each protected class
+    # default use reference that is most observable for dictionary
+    attributes_and_reference_groups = {}
+    for protected_class in protected_classes:
+        attributes_and_reference_groups.update(
+            {protected_class: df[protected_class].value_counts().idxmax()}
+        )
+
+    attributes_to_audit = list(attributes_and_reference_groups.keys())
+
+    # Initialize Aequitas
+    g = Group()
+    b = Bias()
+    aqp = Plot()
+
+    columns_to_include = (
+        protected_classes + [dataset.target_column] + ["score", "label_value"]
+    )
+
+    # get_crosstabs returns a dataframe of the group counts and group value bias metrics.
+    xtab, _ = g.get_crosstabs(df[columns_to_include], attr_cols=attributes_to_audit)
+    bdf = b.get_disparity_predefined_groups(
+        xtab,
+        original_df=df[columns_to_include],
+        ref_groups_dict=attributes_and_reference_groups,
+        alpha=0.05,
+        mask_significance=True,
+    )
+
+    plots = []
+    for protected_class in protected_classes:
+        plot = ap.disparity(
+            bdf, metrics, protected_class, fairness_threshold=disparity_tolerance
+        )
+
+        buf = io.BytesIO()  # create a bytes array to save the image into in memory
+        plot.save(
+            buf, format="png"
+        )  # as long as the above library is installed, this will work
+        plots.append(buf.getvalue())
+
+    string = "_disparity"
+    metrics_adj = [x + string for x in metrics]
+
+    table = bdf[["attribute_name", "attribute_value"] + b.list_disparities(bdf)]
+    plots.append(aqp.plot_disparity_all(bdf, metrics=metrics_adj))
+    plots_return = tuple(plots)
+
+    return (table, *plots_return)
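
For orientation only, not part of the package diff: the new disparity test would presumably be invoked through ValidMind's test-running harness. A minimal sketch, assuming a `run_test` entry point and pre-initialized `vm_test_ds`/`vm_model` objects (dataset and model setup is not shown); the test ID, parameter names, and defaults are taken from the file above, while the column names are purely illustrative.

# Hypothetical usage sketch (assumed harness API; not from the diff)
from validmind.tests import run_test

result = run_test(
    "validmind.data_validation.ProtectedClassesDisparity",
    inputs={"dataset": vm_test_ds, "model": vm_model},  # assumed pre-initialized ValidMind objects
    params={
        "protected_classes": ["Gender", "Race"],  # illustrative column names
        "disparity_tolerance": 1.25,              # default from the file above
        "metrics": ["fnr", "fpr", "tpr"],         # default from the file above
    },
)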

validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py
@@ -0,0 +1,172 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import json
+import sys
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from fairlearn.metrics import (
+    MetricFrame,
+    count,
+    demographic_parity_ratio,
+    equalized_odds_ratio,
+    false_negative_rate,
+    false_positive_rate,
+    true_positive_rate,
+)
+from fairlearn.postprocessing import ThresholdOptimizer, plot_threshold_optimizer
+
+from validmind import tags, tasks
+from validmind.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+@tags("bias_and_fairness")
+@tasks("classification", "regression")
+def ProtectedClassesThresholdOptimizer(
+    dataset, pipeline=None, protected_classes=None, X_train=None, y_train=None
+):
+    """
+    Obtains a classifier by applying group-specific thresholds to the provided estimator.
+
+    ### Purpose
+
+    This test aims to optimize the fairness of a machine learning model by applying different
+    classification thresholds for different protected groups. It helps in mitigating bias and
+    achieving more equitable outcomes across different demographic groups.
+
+    ### Test Mechanism
+
+    The test uses Fairlearn's ThresholdOptimizer to:
+    1. Fit an optimizer on the training data, considering protected classes.
+    2. Apply optimized thresholds to make predictions on the test data.
+    3. Calculate and report various fairness metrics.
+    4. Visualize the optimized thresholds.
+
+    ### Signs of High Risk
+
+    - Large disparities in fairness metrics (e.g., Demographic Parity Ratio, Equalized Odds Ratio)
+    across different protected groups.
+    - Significant differences in False Positive Rates (FPR) or True Positive Rates (TPR) between groups.
+    - Thresholds that vary widely across different protected groups.
+
+    ### Strengths
+
+    - Provides a post-processing method to improve model fairness without modifying the original model.
+    - Allows for balancing multiple fairness criteria simultaneously.
+    - Offers visual insights into the threshold optimization process.
+
+    ### Limitations
+
+    - May lead to a decrease in overall model performance while improving fairness.
+    - Requires access to protected attribute information at prediction time.
+    - The effectiveness can vary depending on the chosen fairness constraint and objective.
+    """
+
+    if sys.version_info < (3, 9):
+        raise RuntimeError("This test requires Python 3.9 or higher.")
+
+    if (
+        pipeline is None
+        or protected_classes is None
+        or X_train is None
+        or y_train is None
+    ):
+        logger.warning(
+            "Missing required parameters. Please provide pipeline, protected_classes, X_train, and y_train."
+        )
+        return pd.DataFrame()
+
+    test_df = dataset.df
+
+    threshold_optimizer = initialize_and_fit_optimizer(
+        pipeline, X_train, y_train, X_train[protected_classes]
+    )
+
+    fig = plot_thresholds(threshold_optimizer)
+
+    target = dataset.target_column
+    y_pred_opt = make_predictions(threshold_optimizer, test_df, protected_classes)
+
+    fairness_metrics = calculate_fairness_metrics(
+        test_df, target, y_pred_opt, protected_classes
+    )
+
+    return (
+        {"DPR and EOR Table": fairness_metrics.reset_index()},
+        fig,
+    )
+
+
+def initialize_and_fit_optimizer(pipeline, X_train, y_train, protected_classes_df):
+    threshold_optimizer = ThresholdOptimizer(
+        estimator=pipeline,
+        objective="balanced_accuracy_score",
+        constraints="demographic_parity",
+        predict_method="predict_proba",
+        prefit=False,
+    )
+    threshold_optimizer.fit(X_train, y_train, sensitive_features=protected_classes_df)
+    return threshold_optimizer
+
+
+def plot_thresholds(threshold_optimizer):
+    fig = plt.figure()
+    plot_threshold_optimizer(threshold_optimizer, show_plot=False)
+    return fig
+
+
+def make_predictions(threshold_optimizer, test_df, protected_classes):
+    y_pred_opt = threshold_optimizer.predict(
+        test_df, sensitive_features=test_df[protected_classes]
+    )
+    return y_pred_opt
+
+
+def calculate_fairness_metrics(test_df, target, y_pred_opt, protected_classes):
+    fairness_metrics = pd.DataFrame(
+        columns=protected_classes,
+        index=["demographic parity ratio", "equal odds ratio"],
+    )
+
+    for feature in protected_classes:
+        dpr = demographic_parity_ratio(
+            y_true=test_df[target],
+            y_pred=y_pred_opt,
+            sensitive_features=test_df[[feature]],
+        )
+        eor = equalized_odds_ratio(
+            y_true=test_df[target],
+            y_pred=y_pred_opt,
+            sensitive_features=test_df[[feature]],
+        )
+        fairness_metrics[feature] = [round(dpr, 2), round(eor, 2)]
+
+    return fairness_metrics
+
+
+def calculate_group_metrics(test_df, target, y_pred_opt, protected_classes):
+    metrics = {
+        "fpr": false_positive_rate,
+        "tpr": true_positive_rate,
+        "fnr": false_negative_rate,
+        "count": count,
+    }
+    mf = MetricFrame(
+        metrics=metrics,
+        y_true=test_df[target],
+        y_pred=y_pred_opt,
+        sensitive_features=test_df[protected_classes],
+    )
+    group_metrics = mf.by_group
+    return group_metrics
+
+
+def get_thresholds_by_group(threshold_optimizer):
+    threshold_rules = threshold_optimizer.interpolated_thresholder_.interpolation_dict
+    thresholds = json.dumps(threshold_rules, default=str, indent=4)
+    thresholds_df = pd.DataFrame.from_records(json.loads(thresholds))
+    return thresholds_df
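
Again an illustration rather than package content: the threshold-optimizer test takes the fitted pipeline and training data as parameters because Fairlearn refits group-specific thresholds with the protected attributes as sensitive features. A sketch under the same assumptions as above (hypothetical names, assumed `run_test` harness):

# Hypothetical usage sketch (assumed harness API; not from the diff)
from validmind.tests import run_test

result = run_test(
    "validmind.data_validation.ProtectedClassesThresholdOptimizer",
    inputs={"dataset": vm_test_ds},  # assumed pre-initialized ValidMind dataset
    params={
        "pipeline": clf_pipeline,                 # fitted sklearn pipeline/estimator (illustrative)
        "protected_classes": ["Gender", "Race"],  # illustrative column names
        "X_train": X_train,
        "y_train": y_train,
    },
)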

validmind/tests/{model_validation/statsmodels → data_validation}/RunsTest.py
@@ -2,12 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import pandas as pd
 from statsmodels.sandbox.stats.runs import runstest_1samp
 
-from validmind
+from validmind import tags, tasks
 
 
-class RunsTest(Metric):
+@tasks("classification", "regression")
+@tags("tabular_data", "statistical_test", "statsmodels")
+def RunsTest(dataset):
     """
     Executes Runs Test on ML model to detect non-random patterns in output data sequence.
 

@@ -52,24 +55,18 @@ class RunsTest(Metric):
     - Does not provide model performance evaluation; it is used to detect patterns in the sequence of outputs only.
     """
 
-
-    required_inputs = ["dataset"]
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "statistical_test", "statsmodels"]
+    df = dataset.df[dataset.feature_columns_numeric]
 
-
-
-
-
-
+    runs_test_values = {}
+    for col in df.columns:
+        runs_stat, runs_p_value = runstest_1samp(df[col].values)
+        runs_test_values[col] = {
+            "stat": runs_stat,
+            "pvalue": runs_p_value,
+        }
 
-
-
-
+    runs_test_df = pd.DataFrame.from_dict(runs_test_values, orient="index")
+    runs_test_df.reset_index(inplace=True)
+    runs_test_df.columns = ["feature", "stat", "pvalue"]
 
-
-                "stat": runs_stat,
-                "pvalue": runs_p_value,
-            }
-
-        return self.cache_results(runs_test_values)
+    return runs_test_df
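
The RunsTest change converts the class-based `Metric` into a plain function that returns a per-feature DataFrame. For reference, the core computation the new function performs looks roughly like the following standalone sketch (toy data; the statsmodels call is the one used in the diff, whose default cutoff is the sample mean):

import pandas as pd
from statsmodels.sandbox.stats.runs import runstest_1samp

toy = pd.DataFrame({"x": [0.1, 0.5, 0.3, 0.9, 0.2, 0.8], "y": [1.0, 0.9, 1.1, 1.2, 0.8, 1.0]})

results = {}
for col in toy.columns:
    stat, pvalue = runstest_1samp(toy[col].values)  # (z-statistic, p-value) of the runs test
    results[col] = {"stat": stat, "pvalue": pvalue}

runs_test_df = pd.DataFrame.from_dict(results, orient="index").reset_index()
runs_test_df.columns = ["feature", "stat", "pvalue"]
print(runs_test_df)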

validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py
@@ -2,12 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import pandas as pd
 from scipy import stats
 
-from validmind
+from validmind import tags, tasks
 
 
-class ShapiroWilk(Metric):
+@tasks("classification", "regression")
+@tags("tabular_data", "data_distribution", "statistical_test")
+def ShapiroWilk(dataset):
     """
     Evaluates feature-wise normality of training data using the Shapiro-Wilk test.
 

@@ -49,23 +52,18 @@ class ShapiroWilk(Metric):
     - Lastly, the Shapiro-Wilk test is not optimally suited for processing data with pronounced skewness or kurtosis.
     """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "stat": sw_stat,
-                "pvalue": sw_pvalue,
-            }
-
-        return self.cache_results(sw_values)
+    df = dataset.df[dataset.feature_columns_numeric]
+
+    sw_values = {}
+    for col in df.columns:
+        sw_stat, sw_pvalue = stats.shapiro(df[col].values)
+        sw_values[col] = {
+            "stat": sw_stat,
+            "pvalue": sw_pvalue,
+        }
+
+    sw_df = pd.DataFrame.from_dict(sw_values, orient="index")
+    sw_df.reset_index(inplace=True)
+    sw_df.columns = ["column", "stat", "pvalue"]
+
+    return sw_df
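
ShapiroWilk follows the same refactoring pattern; the underlying SciPy call returns a (statistic, p-value) pair per column, where a small p-value flags departure from normality. A quick illustrative check on synthetic data:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
stat, pvalue = stats.shapiro(rng.normal(size=200))  # normal sample, so a large p-value is expected
print(round(stat, 3), round(pvalue, 3))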

validmind/tests/data_validation/nlp/Hashtags.py
@@ -9,8 +9,7 @@ Threshold based tests
 import re
 from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import plotly.graph_objects as go
 
 from validmind.vm_models import Figure, ThresholdTest, VMDataset
 

@@ -74,25 +73,23 @@ class Hashtags(ThresholdTest):
         text_column = self.inputs.dataset.text_column
 
         def find_hash(text):
-
-
-
-
-
-
-
-            )
-        temp = (
-            temp.to_frame()
-            .reset_index()
-            .rename(columns={"index": "Hashtag", text_column: "count"})
-        )
+            return re.findall(r"(?<=#)\w+", str(text))
+
+        # Extract hashtags from the text column and count occurrences
+        hashtags = self.inputs.dataset.df[text_column].apply(find_hash).explode()
+        temp = hashtags.value_counts().head(self.params["top_hashtags"])
+
+        print(f"temp: {temp}")
 
         figures = []
         if not temp.empty:
-            fig =
-
-
+            fig = go.Figure(data=[go.Bar(x=temp.index, y=temp.values)])
+            fig.update_layout(
+                title="Top Hashtags",
+                xaxis_title="Hashtag",
+                yaxis_title="Count",
+                xaxis_tickangle=-45,
+            )
             figures.append(
                 Figure(
                     for_object=self,

@@ -100,7 +97,5 @@ class Hashtags(ThresholdTest):
                     figure=fig,
                 )
             )
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
 
         return self.cache_results([], passed=True, figures=figures)
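
The Hashtags test now extracts hashtags with a lookbehind regex and renders the counts with Plotly instead of seaborn/matplotlib. A standalone sketch of the same extraction and figure construction, on made-up text data:

import re

import pandas as pd
import plotly.graph_objects as go

texts = pd.Series(["loving #python and #data", "more #python please"])  # illustrative data
hashtags = texts.apply(lambda t: re.findall(r"(?<=#)\w+", str(t))).explode()
counts = hashtags.value_counts().head(25)

fig = go.Figure(data=[go.Bar(x=counts.index, y=counts.values)])
fig.update_layout(title="Top Hashtags", xaxis_title="Hashtag", yaxis_title="Count", xaxis_tickangle=-45)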

validmind/tests/data_validation/nlp/TextDescription.py
@@ -84,7 +84,6 @@ class TextDescription(Metric):
     tags = ["nlp", "text_data", "visualization"]
 
     def general_text_metrics(self, df, text_column):
-        nltk.download("punkt", quiet=True)
         results = []
 
         for text in df[text_column]:

@@ -175,6 +174,9 @@ class TextDescription(Metric):
         if not isinstance(self.inputs.dataset, VMDataset):
             raise ValueError("TextDescription requires a validmind Dataset object")
 
+        # download nltk data
+        nltk.download("punkt_tab", quiet=True)
+
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
         )

validmind/tests/model_validation/ContextualRecall.py
@@ -58,6 +58,9 @@ def ContextualRecall(dataset, model):
     - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
     """
 
+    # download nltk data
+    nltk.download("punkt_tab", quiet=True)
+
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
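
Both NLP-related tests above now fetch NLTK's `punkt_tab` resource at run time rather than `punkt`; `punkt_tab` is, to the best of our understanding, the tokenizer data that recent NLTK releases expect. If preparing an environment ahead of time, the same download can be issued directly:

import nltk

nltk.download("punkt_tab", quiet=True)  # tokenizer data used by NLTK's word/sentence tokenizers in recent releases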

validmind/tests/model_validation/ragas/AspectCritique.py
@@ -103,8 +103,8 @@ def AspectCritique(
     """
     try:
         from ragas import evaluate
-        from ragas.metrics
-        from ragas.metrics.
+        from ragas.metrics import AspectCritic
+        from ragas.metrics._aspect_critic import (
             coherence,
             conciseness,
             correctness,

@@ -114,7 +114,7 @@ def AspectCritique(
     except ImportError:
         raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
-
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,

@@ -136,16 +136,15 @@ def AspectCritique(
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
         else []
     )
-    all_aspects = [
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()