validmind 2.1.1__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
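
Many of the data_validation changes further down swap the old dataset getter methods (for example get_categorical_features_columns(), visible in the removed lines) for the new feature_columns, feature_columns_numeric and feature_columns_categorical properties on the VM dataset. A minimal sketch of the new accessors, assuming vm.init_dataset keeps its usual signature; the frame and column names are made up:

import pandas as pd

import validmind as vm

# Made-up toy frame: two numeric features, one categorical feature, one target
df = pd.DataFrame(
    {
        "age": [34, 51, 29],
        "income": [40_000, 72_000, 38_000],
        "segment": ["a", "b", "a"],
        "default": [0, 1, 0],
    }
)

# Wrap it as a ValidMind dataset
vm_ds = vm.init_dataset(dataset=df, target_column="default", input_id="demo_ds")

# 2.2.x property-style accessors used throughout the updated tests
print(vm_ds.feature_columns)              # all feature names, e.g. ["age", "income", "segment"]
print(vm_ds.feature_columns_numeric)      # numeric features only
print(vm_ds.feature_columns_categorical)  # categorical features only
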
validmind/models/pytorch.py
CHANGED

@@ -4,37 +4,21 @@

 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.vm_models.model import (
-    ModelAttributes,
-    VMModel,
-    has_method_with_arguments,
-)
+from validmind.vm_models.model import VMModel, has_method_with_arguments

 logger = get_logger(__name__)


 class PyTorchModel(VMModel):
-    """
-    An PyTorch model class that wraps a trained model instance and its associated data.
+    """PyTorchModel class wraps a PyTorch model"""

-
-
-
-        device_type(str, optional) The device where model is trained
-    """
+    def __post_init__(self):
+        if not self.model:
+            raise ValueError("Model object is a required argument for PyTorchModel")

-    def __init__(
-        self,
-        model: object = None,  # Trained model instance
-        input_id: str = None,
-        attributes: ModelAttributes = None,
-    ):
-        super().__init__(
-            model=model,
-            input_id=input_id,
-            attributes=attributes,
-        )
-        self._device_type = next(self.model.parameters()).device
+        self.library = "torch"
+        self.name = self.name or "PyTorch Neural Network"
+        self.device_type = next(self.model.parameters()).device

     def predict_proba(self, *args, **kwargs):
         """
@@ -61,21 +45,3 @@ class PyTorchModel(VMModel):
         import torch

         return self.model.predict(torch.tensor(args[0]).to(self.device_type))
-
-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return "torch"
-
-    def model_class(self):
-        """
-        Returns the model class name
-        """
-        return "PyTorchModel"
-
-    def model_name(self):
-        """
-        Returns model architecture
-        """
-        return "PyTorch Neural Networks"
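
A rough usage sketch of the refactored wrapper. Constructing PyTorchModel directly with keyword arguments is an assumption based on the dataclass-style __post_init__ above (in a typical workflow you would go through vm.init_model instead); the attribute names come straight from the diff:

import torch.nn as nn

from validmind.models.pytorch import PyTorchModel

# Tiny throwaway network so the wrapper has parameters to inspect
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))

# Assumed keyword-style construction; __post_init__ fills in the metadata
vm_model = PyTorchModel(input_id="torch_model", model=net)

print(vm_model.library)      # "torch"
print(vm_model.name)         # "PyTorch Neural Network" unless a name was given
print(vm_model.device_type)  # device of the first parameter, e.g. cpu
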
validmind/models/r_model.py
CHANGED

@@ -6,7 +6,7 @@ import numpy as np
 import pandas as pd

 from validmind.logging import get_logger
-from validmind.vm_models.model import
+from validmind.vm_models.model import VMModel

 logger = get_logger(__name__)

@@ -16,49 +16,23 @@ def get_full_class_name(obj):


 class RModel(VMModel):
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-        attributes: ModelAttributes = None,
-    ):
-        self.r = r
-        self._is_classification_model = False
-
-        super().__init__(
-            model=model,
-            attributes=attributes,
+    """An R model class that wraps a "fitted" R model instance and its associated data."""
+
+    def __post_init__(self):
+        self.language = self.r["version"].rx2("version.string")[0]
+        self.library = self.class_ = "R"
+
+        name_map = {
+            "xgb.Booster": "XGBoost",
+            "glm": self.__glm_model_class(),
+            "lm": "Linear Regression",
+        }
+        self.name = self.name or name_map.get(
+            self.__model_class(), self.__model_class()
         )

         self._is_classification_model = self.__is_classification_model()

-    def __get_predict_data_as_df(self, new_data):
-        """
-        Builds the correct data shape and format for the predict method when the
-        caller has passed a Pandas dataframe as input. This function makes sure to
-        adjust the shape of the input dataset to the predict() signature depending
-        if it's a regular R model or an XGBoost model
-        """
-        if self.__model_class() == "xgb.Booster":
-            return new_data.df.drop(new_data.target_column, axis=1)
-
-        return new_data.df
-
-    def __model_class(self):
-        """
-        Returns the model class name
-        """
-        return self.r["class"](self.model)[0]
-
     def __is_classification_model(self):
         """
         Only supported classification models are XGBClassifier and GLM with binomial family (logistic regression).
@@ -78,6 +52,24 @@ class RModel(VMModel):

         return False

+    def __get_predict_data_as_df(self, new_data):
+        """
+        Builds the correct data shape and format for the predict method when the
+        caller has passed a Pandas dataframe as input. This function makes sure to
+        adjust the shape of the input dataset to the predict() signature depending
+        if it's a regular R model or an XGBoost model
+        """
+        if self.__model_class() == "xgb.Booster":
+            return new_data.df.drop(new_data.target_column, axis=1)
+
+        return new_data.df
+
+    def __model_class(self):
+        """
+        Returns the model class name
+        """
+        return self.r["class"](self.model)[0]
+
     def __glm_model_class(self):
         """
         Returns the model class name for GLM models which include family and link function
@@ -142,9 +134,7 @@ class RModel(VMModel):

         if new_data_class == "numpy.ndarray":
             # We need to reconstruct the DataFrame from the ndarray using the column names
-            new_data = pd.DataFrame(
-                new_data, columns=self.test_ds.get_features_columns()
-            )
+            new_data = pd.DataFrame(new_data, columns=self.test_ds.feature_columns)
         elif new_data_class != "pandas.core.frame.DataFrame":
             raise ValueError(
                 f"new_data must be a DataFrame or ndarray. Got {new_data_class}"
@@ -163,45 +153,6 @@ class RModel(VMModel):

         return predicted_probs

-    def model_language(self):
-        """
-        Returns the model library name
-        """
-        return self.r["version"].rx2("version.string")[0]
-
-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return "R"
-
-    def model_library_version(self, *args, **kwargs):
-        """
-        Model framework library version
-        """
-        return "N/A"
-
-    def model_class(self):
-        """
-        Returns the model class name
-        """
-        return "R"
-
-    def model_name(self):
-        """
-        Returns model name
-        """
-        model_class_name = self.__model_class()
-
-        if model_class_name == "lm":
-            return "Linear Regression"
-        elif model_class_name == "xgb.Booster":
-            return "XGBoost"
-        elif model_class_name == "glm":
-            return self.__glm_model_class()
-
-        return model_class_name
-
     def regression_coefficients(self):
         """
         Returns the regression coefficients summary of the model
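
The new __post_init__ resolves a friendly model name with a dict.get fallback. A tiny standalone sketch of that pattern, using the same mapping keys as the diff; the glm label is simplified here because the real __glm_model_class() inspects the live R object:

def friendly_r_model_name(r_class: str) -> str:
    """Mirror of the name_map lookup in RModel.__post_init__."""
    name_map = {
        "xgb.Booster": "XGBoost",
        "glm": "Generalized Linear Model",  # RModel derives family/link from the R object instead
        "lm": "Linear Regression",
    }
    # Unknown classes fall back to the raw R class name
    return name_map.get(r_class, r_class)


print(friendly_r_model_name("lm"))            # Linear Regression
print(friendly_r_model_name("randomForest"))  # randomForest (fallback)
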
validmind/models/sklearn.py
CHANGED

@@ -2,38 +2,23 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+import pandas as pd
+
 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.vm_models.model import (
-    ModelAttributes,
-    VMModel,
-    has_method_with_arguments,
-)
+from validmind.vm_models.model import VMModel, has_method_with_arguments

 logger = get_logger(__name__)


 class SKlearnModel(VMModel):
-
-
-
-
-
-
-
-    """
-
-    def __init__(
-        self,
-        model: object = None,  # Trained model instance
-        input_id: str = None,
-        attributes: ModelAttributes = None,
-    ):
-        super().__init__(
-            model=model,
-            input_id=input_id,
-            attributes=attributes,
-        )
+    def __post_init__(self):
+        if not self.model:
+            raise ValueError("Model object is a required argument for SKlearnModel")
+
+        self.library = self.model.__class__.__module__.split(".")[0]
+        self.class_ = self.model.__class__.__name__
+        self.name = self.name or type(self.model).__name__

     def predict_proba(self, *args, **kwargs):
         """
@@ -54,20 +39,36 @@ class SKlearnModel(VMModel):
         """
         return self.model.predict(*args, **kwargs)

-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return self.model.__class__.__module__.split(".")[0]

-
-
-
-
-
+class CatBoostModel(SKlearnModel):
+    """Wrapper for CatBoost model"""
+
+    pass
+

-
+class XGBoostModel(SKlearnModel):
+    """Wrapper for XGBoost model"""
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.library = "xgboost"
+
+
+class StatsModelsModel(SKlearnModel):
+    """Wrapper for StatsModels model"""
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.library = "statsmodels"
+
+    def regression_coefficients(self):
         """
-        Returns model
+        Returns the regression coefficients summary of the model
         """
-
+        raw_summary = self.model.summary()
+
+        table = raw_summary.tables[1].data
+        headers = table.pop(0)
+        headers[0] = "Feature"
+
+        return pd.DataFrame(table, columns=headers)
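
The new StatsModelsModel.regression_coefficients pulls the coefficient table out of the statsmodels text summary rather than the fitted parameters. A self-contained sketch of the same extraction against a small OLS fit (values come back as strings because they are read from the rendered summary table):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(100, 2)))
y = X @ np.array([1.0, 2.0, -0.5]) + rng.normal(scale=0.1, size=100)

model = sm.OLS(y, X).fit()

# Same steps as regression_coefficients(): the second summary table holds the coefficients
table = model.summary().tables[1].data
headers = table.pop(0)
headers[0] = "Feature"

coeffs = pd.DataFrame(table, columns=headers)
print(coeffs)
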
validmind/template.py
CHANGED

@@ -2,20 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from pprint import pformat
-
-import mistune
-from IPython.display import display
 from ipywidgets import HTML, Accordion, VBox

 from .html_templates.content_blocks import (
     failed_content_block_html,
     non_test_content_block_html,
-    test_content_block_html,
 )
 from .logging import get_logger
 from .tests import LoadTestError, describe_test
-from .utils import is_notebook
+from .utils import display, is_notebook
 from .vm_models import TestSuite

 logger = get_logger(__name__)
@@ -26,6 +21,7 @@ CONTENT_TYPE_MAP = {
     "metadata_text": "Metadata Text",
     "dynamic": "Dynamic Content",
     "text": "Text",
+    "risk_assessment": "Risk Assessment",
 }


@@ -66,29 +62,12 @@ def _create_content_widget(content):
     )

     try:
-
+        test_html = describe_test(test_id=content["content_id"], show=False)
     except LoadTestError:
         return HTML(failed_content_block_html.format(test_id=content["content_id"]))

     return Accordion(
-        children=[
-            HTML(
-                test_content_block_html.format(
-                    title=test_deets["Name"],
-                    description=mistune.html(test_deets["Description"]),
-                    required_inputs=", ".join(
-                        test_deets["Required Inputs"] or ["None"]
-                    ),
-                    params_table="\n".join(
-                        [
-                            f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
-                            for param, value in test_deets["Params"].items()
-                        ]
-                    ),
-                    table_display="table" if test_deets["Params"] else "none",
-                )
-            )
-        ],
+        children=[HTML(test_html)],
         titles=[f"{content_type} Block: '{content['content_id']}'"],
     )

@@ -117,7 +96,10 @@ def _create_sub_section_widget(sub_sections, section_number):
                 contents_widget,
             )
         else:
-            accordion.children = (
+            accordion.children = (
                *accordion.children,
                HTML("<p>Empty Section</p>"),
            )

         accordion.set_title(
             i, f"{section_number}.{i + 1}. {section['title']} ('{section['id']}')"
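
_create_content_widget now delegates rendering to describe_test(..., show=False) and simply wraps the returned HTML. A sketch of the same consumption pattern used outside the template module; the test ID is illustrative:

from ipywidgets import HTML, Accordion

from validmind.tests import describe_test

# show=False returns the rendered HTML string instead of displaying it
test_html = describe_test(
    test_id="validmind.data_validation.Duplicates",  # illustrative test ID
    show=False,
)

widget = Accordion(
    children=[HTML(test_html)],
    titles=["Test Block: 'validmind.data_validation.Duplicates'"],
)
widget  # display in a notebook cell
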
validmind/tests/__init__.py
CHANGED

@@ -6,22 +6,29 @@

 import importlib
 import inspect
+import json
 import sys
 from pathlib import Path
 from pprint import pformat
 from typing import Dict
+from uuid import uuid4

-import mistune
 import pandas as pd
-from
-from ipywidgets import HTML
+from ipywidgets import HTML, Accordion

 from ..errors import LoadTestError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
 from ..unit_metrics import run_metric
 from ..unit_metrics.composite import load_composite_metric
-from ..utils import
+from ..utils import (
+    NumpyEncoder,
+    display,
+    format_dataframe,
+    fuzzy_match,
+    md_to_html,
+    test_id_to_name,
+)
 from ..vm_models import TestContext, TestInput
 from .decorator import metric, tags, tasks
 from .test_providers import LocalTestProvider, TestProvider
@@ -75,10 +82,12 @@ def _pretty_list_tests(tests, truncate=True):

     table = [
         {
-            "
+            "ID": test_id,
             "Name": test_id_to_name(test_id),
+            "Test Type": __test_classes[test_id].test_type,
             "Description": _test_description(__test_classes[test_id], truncate),
-            "
+            "Required Inputs": __test_classes[test_id].required_inputs,
+            "Params": __test_classes[test_id].default_params or {},
         }
         for test_id in tests
     ]
@@ -339,7 +348,7 @@ def load_test(test_id: str, reload=False):
     return test


-def describe_test(test_id: str = None, raw: bool = False):
+def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
     """Get or show details about the test

     This function can be used to see test details including the test name, description,
@@ -365,20 +374,34 @@ def describe_test(test_id: str = None, raw: bool = False):
     if raw:
         return details

+    html = test_content_block_html.format(
+        test_id=test_id,
+        uuid=str(uuid4()),
+        title=f'{details["Name"]}',
+        description=md_to_html(details["Description"].strip()),
+        required_inputs=", ".join(details["Required Inputs"] or ["None"]),
+        params_table="\n".join(
+            [
+                f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
+                for param, value in details["Params"].items()
+            ]
+        ),
+        table_display="table" if details["Params"] else "none",
+        example_inputs=json.dumps(
+            {name: f"my_vm_{name}" for name in details["Required Inputs"]},
+            indent=4,
+        ),
+        example_params=json.dumps(details["Params"] or {}, indent=4, cls=NumpyEncoder),
+        instructions_display="block" if show else "none",
+    )
+
+    if not show:
+        return html
+
     display(
-
-
-
-                description=mistune.html(details["Description"].strip()),
-                required_inputs=", ".join(details["Required Inputs"] or ["None"]),
-                params_table="\n".join(
-                    [
-                        f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
-                        for param, value in details["Params"].items()
-                    ]
-                ),
-                table_display="table" if details["Params"] else "none",
-            )
+        Accordion(
+            children=[HTML(html)],
+            titles=[f"Test Description: {details['Name']} ('{test_id}')"],
         )
     )


validmind/tests/data_validation/ANOVAOneWayTable.py
CHANGED

@@ -74,7 +74,7 @@ class ANOVAOneWayTable(Metric):

         # Select all numerical features if none are specified
         if features is None:
-            features = self.inputs.dataset.
+            features = self.inputs.dataset.feature_columns_numeric

         anova_results = self.anova_numerical_features(features, p_threshold)


validmind/tests/data_validation/ChiSquaredFeaturesTable.py
CHANGED

@@ -72,7 +72,7 @@ class ChiSquaredFeaturesTable(Metric):

         # Ensure cat_features is provided
         if not cat_features:
-            cat_features = self.inputs.dataset.
+            cat_features = self.inputs.dataset.feature_columns_categorical

         df = self.inputs.dataset.df


validmind/tests/data_validation/DescriptiveStatistics.py
CHANGED

@@ -116,10 +116,8 @@ class DescriptiveStatistics(Metric):

     def run(self):
         feature_columns = self.inputs.dataset.feature_columns
-        numerical_feature_columns = self.inputs.dataset.
-        categorical_feature_columns = (
-            self.inputs.dataset.get_categorical_features_columns()
-        )
+        numerical_feature_columns = self.inputs.dataset.feature_columns_numeric
+        categorical_feature_columns = self.inputs.dataset.feature_columns_categorical

         df = self.inputs.dataset.df[feature_columns]


validmind/tests/data_validation/Duplicates.py
CHANGED

@@ -84,7 +84,7 @@ class Duplicates(ThresholdTest):
         if self.inputs.dataset.text_column:
             columns = self.inputs.dataset.text_column
         else:
-            columns = self.inputs.dataset.
+            columns = self.inputs.dataset.feature_columns

         df = self.inputs.dataset.df[columns]
         # Find duplicate rows

validmind/tests/data_validation/IsolationForestOutliers.py
CHANGED

@@ -64,13 +64,13 @@ class IsolationForestOutliers(Metric):

     def run(self):
         if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.
+            features_list = self.inputs.dataset.feature_columns
         else:
            features_list = self.params["features_columns"]

         # Check if all elements from features_list are present in the feature columns
         all_present = all(
-            elem in self.inputs.dataset.
+            elem in self.inputs.dataset.feature_columns for elem in features_list
         )
         if not all_present:
             raise ValueError(

validmind/tests/data_validation/LaggedCorrelationHeatmap.py
CHANGED

@@ -115,7 +115,7 @@ class LaggedCorrelationHeatmap(Metric):
         else:
             target_col = self.inputs.dataset.target_column

-        independent_vars = list(self.inputs.dataset.
+        independent_vars = list(self.inputs.dataset.feature_columns)
         num_lags = self.params.get("num_lags", 10)

         if isinstance(target_col, list) and len(target_col) == 1:

validmind/tests/data_validation/TargetRateBarPlots.py
CHANGED

@@ -57,7 +57,7 @@ class TargetRateBarPlots(Metric):

         # Use all categorical features if columns is not specified, else use selected columns
         if columns is None:
-            features = self.inputs.dataset.
+            features = self.inputs.dataset.feature_columns_categorical
         else:
             features = columns


validmind/tests/data_validation/nlp/LanguageDetection.py
ADDED

@@ -0,0 +1,59 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Metrics functions for any Pandas-compatible datasets
+"""
+
+
+import plotly.express as px
+from langdetect import LangDetectException, detect
+
+from validmind import tags, tasks
+
+
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def LanguageDetection(dataset):
+    """
+    Detects the language of each text entry in a dataset and visualizes the distribution of languages
+    as a histogram.
+
+    This method checks for a specified text column in the dataset's dataframe, uses a language detection
+    library to determine the language of each text entry, and returns a histogram plot of the language
+    distribution.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a df attribute (a pandas DataFrame)
+            and a text_column attribute indicating the name of the column containing text. If the
+            text_column attribute is not set, a ValueError is raised.
+
+    Returns:
+        plotly.graph_objs._figure.Figure: A Plotly histogram plot showing the distribution of detected
+            languages across the dataset's text entries.
+
+    Raises:
+        ValueError: If the text_column is not specified in the dataset object.
+    """
+    # check text column
+    if not dataset.text_column:
+        raise ValueError("Please set text_column name in the Validmind Dataset object")
+
+    # Function to detect language
+    def detect_language(text):
+        try:
+            return detect(text)
+        except LangDetectException:
+            return "Unknown"  # Return 'Unknown' if language detection fails
+
+    # Applying the language detection function to each text entry
+    languages = dataset.df[dataset.text_column].apply(detect_language)
+    fig = px.histogram(
+        languages,
+        x=languages,
+        title="Language Distribution",
+        labels={"x": "Language Codes"},
+    )
+
+    return fig
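
A quick way to exercise the new functional-style test in isolation. The stub below only mimics the two attributes the function reads (df and text_column), and assumes the tags/tasks decorators leave the function directly callable; in a real workflow you would pass a dataset created with vm.init_dataset and run the test through the ValidMind test harness:

from dataclasses import dataclass

import pandas as pd

from validmind.tests.data_validation.nlp.LanguageDetection import LanguageDetection


@dataclass
class StubDataset:
    # Minimal stand-in exposing only what LanguageDetection touches
    df: pd.DataFrame
    text_column: str


texts = pd.DataFrame(
    {
        "text": [
            "The model performed well on the holdout set.",
            "Le modèle a bien fonctionné sur l'échantillon de test.",
            "El modelo funcionó bien en la muestra de prueba.",
        ]
    }
)

fig = LanguageDetection(dataset=StubDataset(df=texts, text_column="text"))
fig.show()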