validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +80 -119
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +14 -15
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/JarqueBera.py +70 -0
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LJungBox.py +66 -0
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/RunsTest.py +72 -0
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +42 -40
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +39 -36
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +38 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +143 -158
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- validmind-2.5.18.dist-info/RECORD +324 -0
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
- validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
- validmind-2.5.8.dist-info/RECORD +0 -318
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AspectCritique.py

@@ -11,6 +11,8 @@ from validmind import tags, tasks
 
 from .utils import get_ragas_config, get_renamed_columns
 
+LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
+
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")

@@ -101,8 +103,8 @@ def AspectCritique(
     """
     try:
         from ragas import evaluate
-        from ragas.metrics
-        from ragas.metrics.
+        from ragas.metrics import AspectCritic
+        from ragas.metrics._aspect_critic import (
             coherence,
             conciseness,
             correctness,

@@ -112,7 +114,7 @@ def AspectCritique(
     except ImportError:
         raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
-
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,

@@ -134,21 +136,25 @@ def AspectCritique(
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
        else []
     )
-    all_aspects = [
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
     ).to_pandas()
 
+    # reverse the score for aspects where lower is better
+    for aspect in LOWER_IS_BETTER_ASPECTS:
+        if aspect in result_df.columns:
+            result_df[aspect] = 1 - result_df[aspect]
+
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
         value_vars=[aspect.name for aspect in all_aspects],
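Aside from the updated ragas imports, the behavioral change in this file is the reversal of lower-is-better aspect scores before plotting. A minimal standalone pandas sketch of that step (the score values below are made up for illustration):

```python
import pandas as pd

# Aspects where a lower raw score is better, as defined by the new module constant
LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]

# Hypothetical per-row scores shaped like the output of ragas' evaluate().to_pandas()
result_df = pd.DataFrame({"coherence": [0.9, 0.7], "harmfulness": [0.25, 0.75]})

# Flip lower-is-better aspects so that higher always means better
for aspect in LOWER_IS_BETTER_ASPECTS:
    if aspect in result_df.columns:
        result_df[aspect] = 1 - result_df[aspect]

print(result_df["harmfulness"].tolist())  # [0.75, 0.25]
```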
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -47,6 +47,7 @@ def ContextEntityRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which will be evaluated to make
     sure if they contain the entities present in the ground truth.
     - `ground_truth` (str): The ground truth text from which the entities will be

@@ -113,13 +114,13 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                [
-                    "contexts",
-                    "ground_truth",
-                    "context_entity_recall",
-                ]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     [
+            #         "contexts",
+            #         "ground_truth",
+            #         "context_entity_recall",
+            #     ]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_entity_recall"].mean(),

@@ -127,7 +128,7 @@ def ContextEntityRecall(
                     "Max Score": result_df["context_entity_recall"].max(),
                     "Min Score": result_df["context_entity_recall"].min(),
                     "Standard Deviation": result_df["context_entity_recall"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -40,6 +40,7 @@ def ContextPrecision(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
     will be evaluated to make sure they contain relevant info in the correct order.

@@ -107,9 +108,9 @@ def ContextPrecision(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_precision"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_precision"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_precision"].mean(),

@@ -117,7 +118,7 @@ def ContextPrecision(
                     "Max Score": result_df["context_precision"].max(),
                     "Min Score": result_df["context_precision"].min(),
                     "Standard Deviation": result_df["context_precision"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -40,6 +40,7 @@ def ContextRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
     will be evaluated to make sure they contain all items in the ground truth.

@@ -107,9 +108,9 @@ def ContextRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_recall"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_recall"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_recall"].mean(),

@@ -117,7 +118,7 @@ def ContextRecall(
                     "Max Score": result_df["context_recall"].max(),
                     "Min Score": result_df["context_recall"].min(),
                     "Standard Deviation": result_df["context_recall"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextUtilization.py (new file)

@@ -0,0 +1,155 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+        make sure they contain relevant info in the correct order.
+    - `answer` (str): The llm-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info"
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schema by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import context_utilization
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
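The Context Utilization@K formula in the docstring above can be sanity-checked with a small standalone function. This is an illustration of the formula only, not the ragas implementation (which judges chunk relevance with an LLM); the binary relevance vector stands in for $v_k$:

```python
from typing import List


def context_utilization_at_k(relevance: List[int]) -> float:
    """Precision-weighted utilization over a binary relevance vector.

    relevance[k - 1] plays the role of v_k in the docstring's formula:
    1 if the context chunk at rank k was relevant to the answer, else 0.
    """
    total_relevant = sum(relevance)
    if total_relevant == 0:
        return 0.0

    score = 0.0
    hits = 0
    for k, v_k in enumerate(relevance, start=1):
        if v_k:
            hits += 1
            score += hits / k  # Precision@k, counted only at relevant ranks
    return score / total_relevant


# Relevant chunks at ranks 1 and 3 out of 4 retrieved contexts:
print(context_utilization_at_k([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ≈ 0.833
```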
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -41,6 +41,7 @@ def Faithfulness(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which are retrieved to generate
     the answer.
     - `answer` (str): The response generated by the model which will be evaluated for

@@ -105,9 +106,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["contexts", "answer", "faithfulness"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "faithfulness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),

@@ -115,7 +116,7 @@ def Faithfulness(
                     "Max Score": result_df["faithfulness"].max(),
                     "Min Score": result_df["faithfulness"].min(),
                     "Standard Deviation": result_df["faithfulness"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/NoiseSensitivity.py (new file)

@@ -0,0 +1,152 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'context', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+        the answer.
+    - `answer` (str): The response generated by the model
+    - `ground_truth` (str): The "correct" answer to the question
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info"
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+    critical.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import noise_sensitivity_relevant
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
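The noise sensitivity score defined in the docstring above is simply the share of unsupported claims in the answer. A toy illustration of the arithmetic (the real metric relies on ragas' LLM-based claim extraction and verification):

```python
def noise_sensitivity(incorrect_claims: int, total_claims: int) -> float:
    """Fraction of claims in the generated answer that are incorrect."""
    if total_claims == 0:
        return 0.0
    return incorrect_claims / total_claims


# An answer containing 5 claims, 2 of which are unsupported by the ground truth:
print(noise_sensitivity(incorrect_claims=2, total_claims=5))  # 0.4
```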
validmind/tests/model_validation/ragas/utils.py

@@ -5,11 +5,17 @@
 import os
 
 from validmind.ai.utils import get_client_and_model
+from validmind.client_config import client_config
 
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 
 def get_ragas_config():
+    if not client_config.can_generate_llm_test_descriptions():
+        raise ValueError(
+            "LLM based descriptions are not enabled in the current configuration."
+        )
+
     # import here since its an optional dependency
     try:
         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

@@ -15,29 +15,36 @@ class AdjustedMutualInformation(ClusterPerformance):
     Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
     for chance.
 
-
-    machine learning model, more specifically, a clustering model. It measures the mutual information between the true
-    labels and the ones predicted by the model, adjusting for chance.
+    ### Purpose
 
-
-
-
-
+    The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a machine learning
+    model, more specifically, a clustering model. It measures the mutual information between the true labels and the
+    ones predicted by the model, adjusting for chance.
+
+    ### Test Mechanism
+
+    The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function. This function
+    calculates the mutual information between the true labels and the ones predicted while correcting for the chance
+    correlation expected due to random label assignments. This test requires the model, the training dataset, and the
+    test dataset as inputs.
+
+    ### Signs of High Risk
 
-    **3. Signs of High Risk**:
     - Low Adjusted Mutual Information Score: This score ranges between 0 and 1. A low score (closer to 0) can indicate
     poor model performance as the predicted labels do not align well with the true labels.
-    - In case of high
+    - In case of high-dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
     may not perform reliably.
 
-
+    ### Strengths
+
     - The AMI metric takes into account the randomness of the predicted labels, which makes it more robust than the
     simple Mutual Information.
     - The scale of AMI is not dependent on the sizes of the clustering, allowing for comparability between different
     datasets or models.
     - Good for comparing the output of clustering algorithms where the number of clusters is not known a priori.
 
-
+    ### Limitations
+
     - Adjusted Mutual Information does not take into account the continuous nature of some data. As a result, it may
     not be the best choice for regression or other continuous types of tasks.
     - AMI has the drawback of being biased towards clusterings with a higher number of clusters.

@@ -47,7 +54,7 @@ class AdjustedMutualInformation(ClusterPerformance):
     """
 
     name = "adjusted_mutual_information"
-    required_inputs = ["model", "
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
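As the reworked docstring states, the metric is backed by scikit-learn's `adjusted_mutual_info_score`. A standalone example of that call on toy labels (unrelated to any ValidMind dataset):

```python
from sklearn.metrics import adjusted_mutual_info_score

# True cluster labels vs. labels predicted by a clustering model (toy data)
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]  # same grouping, different label names

# AMI is permutation-invariant and chance-adjusted: identical partitions score 1.0
print(adjusted_mutual_info_score(labels_true, labels_pred))  # 1.0
```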
validmind/tests/model_validation/sklearn/AdjustedRandIndex.py

@@ -15,38 +15,43 @@ class AdjustedRandIndex(ClusterPerformance):
     Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine
     learning models.
 
-
+    ### Purpose
+
     The Adjusted Rand Index (ARI) metric is intended to measure the similarity between two data clusters. This metric
-    is specifically
-
-
+    is specifically used for clustering machine learning models to quantify how well the model is clustering and
+    producing data groups. It involves comparing the model's produced clusters against the actual (true) clusters found
+    in the dataset.
+
+    ### Test Mechanism
+
+    The Adjusted Rand Index (ARI) is calculated using the `adjusted_rand_score` method from the `sklearn.metrics`
+    module in Python. The test requires inputs including the model itself and the model's training and test datasets.
+    The model's computed clusters and the true clusters are compared, and the similarities are measured to compute the
+    ARI.
 
-
-    The Adjusted Rand Index (ARI) is calculated by using the `adjusted_rand_score` method from the sklearn metrics in
-    Python. The test requires inputs including the model itself and the model's training and test datasets. The model's
-    computed clusters and the true clusters are compared, and the similarities are measured to compute the ARI.
+    ### Signs of High Risk
 
-
-    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and don't match the
+    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and do not match the
     actual dataset clusters, indicating a high risk.
     - An ARI of less than zero indicates that the model's clustering performance is worse than random.
 
-
-
+    ### Strengths
+
+    - ARI is normalized and provides a consistent metric between -1 and +1, irrespective of raw cluster sizes or
     dataset size variations.
-    - It
-    evaluations.
+    - It does not require a ground truth for computation, making it ideal for unsupervised learning model evaluations.
     - It penalizes for false positives and false negatives, providing a robust measure of clustering quality.
 
-
+    ### Limitations
+
     - In real-world situations, true clustering is often unknown, which can hinder the practical application of the ARI.
     - The ARI requires all individual data instances to be independent, which may not always hold true.
-    - It may be difficult to interpret the implications of an ARI score without
+    - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
 
     name = "adjusted_rand_index"
-    required_inputs = ["model", "
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
|