validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/models/pytorch.py CHANGED
@@ -4,37 +4,21 @@
 
 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.vm_models.model import (
-    ModelAttributes,
-    VMModel,
-    has_method_with_arguments,
-)
+from validmind.vm_models.model import VMModel, has_method_with_arguments
 
 logger = get_logger(__name__)
 
 
 class PyTorchModel(VMModel):
-    """
-    An PyTorch model class that wraps a trained model instance and its associated data.
+    """PyTorchModel class wraps a PyTorch model"""
 
-    Attributes:
-        attributes (ModelAttributes, optional): The attributes of the model. Defaults to None.
-        model (object, optional): The trained model instance. Defaults to None.
-        device_type(str, optional) The device where model is trained
-    """
+    def __post_init__(self):
+        if not self.model:
+            raise ValueError("Model object is a required argument for PyTorchModel")
 
-    def __init__(
-        self,
-        model: object = None,  # Trained model instance
-        input_id: str = None,
-        attributes: ModelAttributes = None,
-    ):
-        super().__init__(
-            model=model,
-            input_id=input_id,
-            attributes=attributes,
-        )
-        self._device_type = next(self.model.parameters()).device
+        self.library = "torch"
+        self.name = self.name or "PyTorch Neural Network"
+        self.device_type = next(self.model.parameters()).device
 
     def predict_proba(self, *args, **kwargs):
         """
@@ -61,21 +45,3 @@ class PyTorchModel(VMModel):
         import torch
 
         return self.model.predict(torch.tensor(args[0]).to(self.device_type))
-
-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return "torch"
-
-    def model_class(self):
-        """
-        Returns the model class name
-        """
-        return "PyTorchModel"
-
-    def model_name(self):
-        """
-        Returns model architecture
-        """
-        return "PyTorch Neural Networks"
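Usage sketch (illustrative, not part of the published diff): the wrapper's metadata now lives in plain attributes set by __post_init__ instead of the removed model_library()/model_class()/model_name() methods. Assuming the dataclass-style VMModel base still accepts the model and input_id keywords from the old constructor, wrapping a torch module looks roughly like this:

import torch.nn as nn

from validmind.models.pytorch import PyTorchModel

# Any trained torch.nn.Module works; a tiny untrained net keeps the sketch short
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))

vm_model = PyTorchModel(model=net, input_id="torch_model")

print(vm_model.library)      # "torch"
print(vm_model.name)         # "PyTorch Neural Network" unless a name was passed
print(vm_model.device_type)  # device of the first parameter, e.g. device(type='cpu')

In notebooks the wrapper is normally created through vm.init_model(); direct construction is shown here only to make the new attributes visible.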
validmind/models/r_model.py CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 import pandas as pd
 
 from validmind.logging import get_logger
-from validmind.vm_models.model import ModelAttributes, VMModel
+from validmind.vm_models.model import VMModel
 
 logger = get_logger(__name__)
 
@@ -16,49 +16,23 @@ def get_full_class_name(obj):
 
 
 class RModel(VMModel):
-    """
-    An R model class that wraps a "fitted" R model instance and its associated data.
-
-    Attributes:
-        attributes (ModelAttributes, optional): The attributes of the model. Defaults to None.
-        model (object, optional): The trained model instance. Defaults to None.
-        device_type(str, optional) The device where model is trained
-    """
-
-    def __init__(
-        self,
-        r: object = None,  # R instance
-        model: object = None,  # Trained model instance
-        attributes: ModelAttributes = None,
-    ):
-        self.r = r
-        self._is_classification_model = False
-
-        super().__init__(
-            model=model,
-            attributes=attributes,
+    """An R model class that wraps a "fitted" R model instance and its associated data."""
+
+    def __post_init__(self):
+        self.language = self.r["version"].rx2("version.string")[0]
+        self.library = self.class_ = "R"
+
+        name_map = {
+            "xgb.Booster": "XGBoost",
+            "glm": self.__glm_model_class(),
+            "lm": "Linear Regression",
+        }
+        self.name = self.name or name_map.get(
+            self.__model_class(), self.__model_class()
         )
 
         self._is_classification_model = self.__is_classification_model()
 
-    def __get_predict_data_as_df(self, new_data):
-        """
-        Builds the correct data shape and format for the predict method when the
-        caller has passed a Pandas dataframe as input. This function makes sure to
-        adjust the shape of the input dataset to the predict() signature depending
-        if it's a regular R model or an XGBoost model
-        """
-        if self.__model_class() == "xgb.Booster":
-            return new_data.df.drop(new_data.target_column, axis=1)
-
-        return new_data.df
-
-    def __model_class(self):
-        """
-        Returns the model class name
-        """
-        return self.r["class"](self.model)[0]
-
     def __is_classification_model(self):
         """
         Only supported classification models are XGBClassifier and GLM with binomial family (logistic regression).
@@ -78,6 +52,24 @@ class RModel(VMModel):
 
         return False
 
+    def __get_predict_data_as_df(self, new_data):
+        """
+        Builds the correct data shape and format for the predict method when the
+        caller has passed a Pandas dataframe as input. This function makes sure to
+        adjust the shape of the input dataset to the predict() signature depending
+        if it's a regular R model or an XGBoost model
+        """
+        if self.__model_class() == "xgb.Booster":
+            return new_data.df.drop(new_data.target_column, axis=1)
+
+        return new_data.df
+
+    def __model_class(self):
+        """
+        Returns the model class name
+        """
+        return self.r["class"](self.model)[0]
+
     def __glm_model_class(self):
         """
         Returns the model class name for GLM models which include family and link function
@@ -142,9 +134,7 @@
 
         if new_data_class == "numpy.ndarray":
             # We need to reconstruct the DataFrame from the ndarray using the column names
-            new_data = pd.DataFrame(
-                new_data, columns=self.test_ds.get_features_columns()
-            )
+            new_data = pd.DataFrame(new_data, columns=self.test_ds.feature_columns)
         elif new_data_class != "pandas.core.frame.DataFrame":
             raise ValueError(
                 f"new_data must be a DataFrame or ndarray. Got {new_data_class}"
@@ -163,45 +153,6 @@
 
         return predicted_probs
 
-    def model_language(self):
-        """
-        Returns the model library name
-        """
-        return self.r["version"].rx2("version.string")[0]
-
-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return "R"
-
-    def model_library_version(self, *args, **kwargs):
-        """
-        Model framework library version
-        """
-        return "N/A"
-
-    def model_class(self):
-        """
-        Returns the model class name
-        """
-        return "R"
-
-    def model_name(self):
-        """
-        Returns model name
-        """
-        model_class_name = self.__model_class()
-
-        if model_class_name == "lm":
-            return "Linear Regression"
-        elif model_class_name == "xgb.Booster":
-            return "XGBoost"
-        elif model_class_name == "glm":
-            return self.__glm_model_class()
-
-        return model_class_name
-
     def regression_coefficients(self):
         """
         Returns the regression coefficients summary of the model
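Note (illustrative, not part of the published diff): RModel follows the same pattern, with model_language(), model_library(), model_class() and model_name() collapsed into language, library, class_ and name attributes assigned in __post_init__. The display name is now a plain dictionary lookup that falls back to the raw R class name; the glm entry below is a hypothetical stand-in for whatever __glm_model_class() returns:

# Resolution logic extracted for illustration only
name_map = {
    "xgb.Booster": "XGBoost",
    "glm": "binomial (logit)",  # placeholder for the family/link string from __glm_model_class()
    "lm": "Linear Regression",
}

for r_class in ("lm", "xgb.Booster", "glm", "randomForest"):
    print(f"{r_class!r} -> {name_map.get(r_class, r_class)!r}")
# 'lm' -> 'Linear Regression'
# 'xgb.Booster' -> 'XGBoost'
# 'glm' -> 'binomial (logit)'
# 'randomForest' -> 'randomForest'  (unknown classes fall back to class(model)[0])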
validmind/models/sklearn.py CHANGED
@@ -2,38 +2,23 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import pandas as pd
+
 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.vm_models.model import (
-    ModelAttributes,
-    VMModel,
-    has_method_with_arguments,
-)
+from validmind.vm_models.model import VMModel, has_method_with_arguments
 
 logger = get_logger(__name__)
 
 
 class SKlearnModel(VMModel):
-    """
-    An SKlearn model class that wraps a trained model instance and its associated data.
-
-    Attributes:
-        attributes (ModelAttributes, optional): The attributes of the model. Defaults to None.
-        model (object, optional): The trained model instance. Defaults to None.
-        device_type(str, optional) The device where model is trained
-    """
-
-    def __init__(
-        self,
-        model: object = None,  # Trained model instance
-        input_id: str = None,
-        attributes: ModelAttributes = None,
-    ):
-        super().__init__(
-            model=model,
-            input_id=input_id,
-            attributes=attributes,
-        )
+    def __post_init__(self):
+        if not self.model:
+            raise ValueError("Model object is a required argument for SKlearnModel")
+
+        self.library = self.model.__class__.__module__.split(".")[0]
+        self.class_ = self.model.__class__.__name__
+        self.name = self.name or type(self.model).__name__
 
     def predict_proba(self, *args, **kwargs):
         """
@@ -54,20 +39,36 @@ class SKlearnModel(VMModel):
         """
         return self.model.predict(*args, **kwargs)
 
-    def model_library(self):
-        """
-        Returns the model library name
-        """
-        return self.model.__class__.__module__.split(".")[0]
 
-    def model_class(self):
-        """
-        Returns the model class name
-        """
-        return self.model.__class__.__name__
+class CatBoostModel(SKlearnModel):
+    """Wrapper for CatBoost model"""
+
+    pass
+
 
-    def model_name(self):
+class XGBoostModel(SKlearnModel):
+    """Wrapper for XGBoost model"""
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.library = "xgboost"
+
+
+class StatsModelsModel(SKlearnModel):
+    """Wrapper for StatsModels model"""
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.library = "statsmodels"
+
+    def regression_coefficients(self):
         """
-        Returns model name
+        Returns the regression coefficients summary of the model
         """
-        return type(self.model).__name__
+        raw_summary = self.model.summary()
+
+        table = raw_summary.tables[1].data
+        headers = table.pop(0)
+        headers[0] = "Feature"
+
+        return pd.DataFrame(table, columns=headers)
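Note (illustrative, not part of the published diff): the standalone catboost.py, xgboost.py and statsmodels.py model modules are removed (see the file list above) and replaced by thin SKlearnModel subclasses defined here, with StatsModelsModel gaining a regression_coefficients() helper built on the statsmodels summary tables. A sketch, again assuming the dataclass-style constructor accepts model and input_id keywords:

import numpy as np
import pandas as pd
import statsmodels.api as sm

from validmind.models.sklearn import StatsModelsModel

rng = np.random.default_rng(0)
X = pd.DataFrame({"x1": rng.normal(size=100), "x2": rng.normal(size=100)})
y = 2.0 * X["x1"] - X["x2"] + rng.normal(scale=0.1, size=100)

fit = sm.OLS(y, sm.add_constant(X)).fit()
vm_model = StatsModelsModel(model=fit, input_id="ols_model")

print(vm_model.library)                    # "statsmodels" (overridden in __post_init__)
print(vm_model.regression_coefficients())  # DataFrame with a "Feature" column plus coef, std err, t, P>|t|, ...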
validmind/template.py CHANGED
@@ -2,20 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from pprint import pformat
-
-import mistune
-from IPython.display import display
 from ipywidgets import HTML, Accordion, VBox
 
 from .html_templates.content_blocks import (
     failed_content_block_html,
     non_test_content_block_html,
-    test_content_block_html,
 )
 from .logging import get_logger
 from .tests import LoadTestError, describe_test
-from .utils import is_notebook
+from .utils import display, is_notebook
 from .vm_models import TestSuite
 
 logger = get_logger(__name__)
@@ -26,6 +21,7 @@ CONTENT_TYPE_MAP = {
     "metadata_text": "Metadata Text",
     "dynamic": "Dynamic Content",
     "text": "Text",
+    "risk_assessment": "Risk Assessment",
 }
 
 
@@ -66,29 +62,12 @@ def _create_content_widget(content):
    )
 
    try:
-        test_deets = describe_test(test_id=content["content_id"], raw=True)
+        test_html = describe_test(test_id=content["content_id"], show=False)
    except LoadTestError:
        return HTML(failed_content_block_html.format(test_id=content["content_id"]))
 
    return Accordion(
-        children=[
-            HTML(
-                test_content_block_html.format(
-                    title=test_deets["Name"],
-                    description=mistune.html(test_deets["Description"]),
-                    required_inputs=", ".join(
-                        test_deets["Required Inputs"] or ["None"]
-                    ),
-                    params_table="\n".join(
-                        [
-                            f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
-                            for param, value in test_deets["Params"].items()
-                        ]
-                    ),
-                    table_display="table" if test_deets["Params"] else "none",
-                )
-            )
-        ],
+        children=[HTML(test_html)],
        titles=[f"{content_type} Block: '{content['content_id']}'"],
    )
 
@@ -117,7 +96,10 @@ def _create_sub_section_widget(sub_sections, section_number):
                contents_widget,
            )
        else:
-            accordion.children = (*accordion.children, HTML("<p>Empty Section</p>"))
+            accordion.children = (
+                *accordion.children,
+                HTML("<p>Empty Section</p>"),
+            )
 
    accordion.set_title(
validmind/tests/__init__.py CHANGED
@@ -6,22 +6,29 @@
 
 import importlib
 import inspect
+import json
 import sys
 from pathlib import Path
 from pprint import pformat
 from typing import Dict
+from uuid import uuid4
 
-import mistune
 import pandas as pd
-from IPython.display import display
-from ipywidgets import HTML
+from ipywidgets import HTML, Accordion
 
 from ..errors import LoadTestError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
 from ..unit_metrics import run_metric
 from ..unit_metrics.composite import load_composite_metric
-from ..utils import format_dataframe, fuzzy_match, test_id_to_name
+from ..utils import (
+    NumpyEncoder,
+    display,
+    format_dataframe,
+    fuzzy_match,
+    md_to_html,
+    test_id_to_name,
+)
 from ..vm_models import TestContext, TestInput
 from .decorator import metric, tags, tasks
 from .test_providers import LocalTestProvider, TestProvider
@@ -75,10 +82,12 @@ def _pretty_list_tests(tests, truncate=True):
 
    table = [
        {
-            "Test Type": __test_classes[test_id].test_type,
+            "ID": test_id,
            "Name": test_id_to_name(test_id),
+            "Test Type": __test_classes[test_id].test_type,
            "Description": _test_description(__test_classes[test_id], truncate),
-            "ID": test_id,
+            "Required Inputs": __test_classes[test_id].required_inputs,
+            "Params": __test_classes[test_id].default_params or {},
        }
        for test_id in tests
    ]
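Note (illustrative, not part of the published diff): the table built by _pretty_list_tests() now surfaces each test's Required Inputs and default Params alongside ID, Name, Test Type and Description. The exact arguments of the public list_tests() wrapper are not shown in this diff, so the call below is only a sketch:

import validmind as vm

# Browsing tests from a notebook now shows what each one needs before you run it:
# ID, Name, Test Type, Description, Required Inputs, Params
vm.tests.list_tests()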
@@ -339,7 +348,7 @@ def load_test(test_id: str, reload=False):
    return test
 
 
-def describe_test(test_id: str = None, raw: bool = False):
+def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
    """Get or show details about the test
 
    This function can be used to see test details including the test name, description,
@@ -365,20 +374,34 @@ def describe_test(test_id: str = None, raw: bool = False):
    if raw:
        return details
 
+    html = test_content_block_html.format(
+        test_id=test_id,
+        uuid=str(uuid4()),
+        title=f'{details["Name"]}',
+        description=md_to_html(details["Description"].strip()),
+        required_inputs=", ".join(details["Required Inputs"] or ["None"]),
+        params_table="\n".join(
+            [
+                f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
+                for param, value in details["Params"].items()
+            ]
+        ),
+        table_display="table" if details["Params"] else "none",
+        example_inputs=json.dumps(
+            {name: f"my_vm_{name}" for name in details["Required Inputs"]},
+            indent=4,
+        ),
+        example_params=json.dumps(details["Params"] or {}, indent=4, cls=NumpyEncoder),
+        instructions_display="block" if show else "none",
+    )
+
+    if not show:
+        return html
+
    display(
-        HTML(
-            test_content_block_html.format(
-                title=f'{details["Name"]}',
-                description=mistune.html(details["Description"].strip()),
-                required_inputs=", ".join(details["Required Inputs"] or ["None"]),
-                params_table="\n".join(
-                    [
-                        f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
-                        for param, value in details["Params"].items()
-                    ]
-                ),
-                table_display="table" if details["Params"] else "none",
-            )
+        Accordion(
+            children=[HTML(html)],
+            titles=[f"Test Description: {details['Name']} ('{test_id}')"],
        )
    )
 
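Note (illustrative, not part of the published diff): describe_test() gains a show flag. raw=True still returns the details dict, the default show=True now displays an Accordion widget, and show=False returns the rendered HTML string, which is what template._create_content_widget() consumes above. Sketch:

from validmind.tests import describe_test

test_id = "validmind.data_validation.nlp.LanguageDetection"

describe_test(test_id)                      # displays an Accordion in the notebook
details = describe_test(test_id, raw=True)  # plain dict of details (unchanged behaviour)
html = describe_test(test_id, show=False)   # new: returns the HTML string, displays nothing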
validmind/tests/data_validation/ANOVAOneWayTable.py CHANGED
@@ -74,7 +74,7 @@ class ANOVAOneWayTable(Metric):
 
        # Select all numerical features if none are specified
        if features is None:
-            features = self.inputs.dataset.get_numeric_features_columns()
+            features = self.inputs.dataset.feature_columns_numeric
 
        anova_results = self.anova_numerical_features(features, p_threshold)
 
validmind/tests/data_validation/ChiSquaredFeaturesTable.py CHANGED
@@ -72,7 +72,7 @@ class ChiSquaredFeaturesTable(Metric):
 
        # Ensure cat_features is provided
        if not cat_features:
-            cat_features = self.inputs.dataset.get_categorical_features_columns()
+            cat_features = self.inputs.dataset.feature_columns_categorical
 
        df = self.inputs.dataset.df
 
validmind/tests/data_validation/DescriptiveStatistics.py CHANGED
@@ -116,10 +116,8 @@
 
    def run(self):
        feature_columns = self.inputs.dataset.feature_columns
-        numerical_feature_columns = self.inputs.dataset.get_numeric_features_columns()
-        categorical_feature_columns = (
-            self.inputs.dataset.get_categorical_features_columns()
-        )
+        numerical_feature_columns = self.inputs.dataset.feature_columns_numeric
+        categorical_feature_columns = self.inputs.dataset.feature_columns_categorical
 
        df = self.inputs.dataset.df[feature_columns]
 
validmind/tests/data_validation/Duplicates.py CHANGED
@@ -84,7 +84,7 @@ class Duplicates(ThresholdTest):
        if self.inputs.dataset.text_column:
            columns = self.inputs.dataset.text_column
        else:
-            columns = self.inputs.dataset.get_features_columns()
+            columns = self.inputs.dataset.feature_columns
 
        df = self.inputs.dataset.df[columns]
        # Find duplicate rows
validmind/tests/data_validation/IsolationForestOutliers.py CHANGED
@@ -64,13 +64,13 @@ class IsolationForestOutliers(Metric):
 
    def run(self):
        if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.get_features_columns()
+            features_list = self.inputs.dataset.feature_columns
        else:
            features_list = self.params["features_columns"]
 
        # Check if all elements from features_list are present in the feature columns
        all_present = all(
-            elem in self.inputs.dataset.get_features_columns() for elem in features_list
+            elem in self.inputs.dataset.feature_columns for elem in features_list
        )
        if not all_present:
            raise ValueError(
validmind/tests/data_validation/LaggedCorrelationHeatmap.py CHANGED
@@ -115,7 +115,7 @@ class LaggedCorrelationHeatmap(Metric):
        else:
            target_col = self.inputs.dataset.target_column
 
-        independent_vars = list(self.inputs.dataset.get_features_columns())
+        independent_vars = list(self.inputs.dataset.feature_columns)
        num_lags = self.params.get("num_lags", 10)
 
        if isinstance(target_col, list) and len(target_col) == 1:
validmind/tests/data_validation/TargetRateBarPlots.py CHANGED
@@ -57,7 +57,7 @@ class TargetRateBarPlots(Metric):
 
        # Use all categorical features if columns is not specified, else use selected columns
        if columns is None:
-            features = self.inputs.dataset.get_categorical_features_columns()
+            features = self.inputs.dataset.feature_columns_categorical
        else:
            features = columns
 
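Note (illustrative, not part of the published diff): the one-line edits across these data_validation tests all track the same dataset-interface rename introduced with the new validmind/vm_models/dataset/ package: the get_features_columns(), get_numeric_features_columns() and get_categorical_features_columns() methods become the feature_columns, feature_columns_numeric and feature_columns_categorical properties. A hypothetical helper showing the 2.2.x spellings, with the 2.1.x equivalents noted in comments:

def split_feature_columns(dataset):
    """Illustrative helper; `dataset` is a validmind VMDataset instance."""
    return {
        # 2.1.x: dataset.get_features_columns()
        "all": dataset.feature_columns,
        # 2.1.x: dataset.get_numeric_features_columns()
        "numeric": dataset.feature_columns_numeric,
        # 2.1.x: dataset.get_categorical_features_columns()
        "categorical": dataset.feature_columns_categorical,
    }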
validmind/tests/data_validation/nlp/LanguageDetection.py ADDED
@@ -0,0 +1,59 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Metrics functions for any Pandas-compatible datasets
+"""
+
+
+import plotly.express as px
+from langdetect import LangDetectException, detect
+
+from validmind import tags, tasks
+
+
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def LanguageDetection(dataset):
+    """
+    Detects the language of each text entry in a dataset and visualizes the distribution of languages
+    as a histogram.
+
+    This method checks for a specified text column in the dataset's dataframe, uses a language detection
+    library to determine the language of each text entry, and returns a histogram plot of the language
+    distribution.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text. If the
+            `text_column` attribute is not set, a ValueError is raised.
+
+    Returns:
+        plotly.graph_objs._figure.Figure: A Plotly histogram plot showing the distribution of detected
+            languages across the dataset's text entries.
+
+    Raises:
+        ValueError: If the `text_column` is not specified in the dataset object.
+    """
+    # check text column
+    if not dataset.text_column:
+        raise ValueError("Please set text_column name in the Validmind Dataset object")
+
+    # Function to detect language
+    def detect_language(text):
+        try:
+            return detect(text)
+        except LangDetectException:
+            return "Unknown"  # Return 'Unknown' if language detection fails
+
+    # Applying the language detection function to each text entry
+    languages = dataset.df[dataset.text_column].apply(detect_language)
+    fig = px.histogram(
+        languages,
+        x=languages,
+        title="Language Distribution",
+        labels={"x": "Language Codes"},
+    )
+
+    return fig
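Usage sketch (illustrative, not part of the published diff): the new functional-style NLP tests such as LanguageDetection only need a dataset whose text_column is set. The init_dataset and run_test parameter names below reflect the public API as generally documented and are not confirmed by this diff, so treat them as assumptions:

import pandas as pd

import validmind as vm

df = pd.DataFrame({"text": ["Hello world", "Bonjour le monde", "Hola mundo"]})

# text_column must be set, otherwise LanguageDetection raises ValueError
vm_ds = vm.init_dataset(dataset=df, text_column="text")

vm.tests.run_test(
    "validmind.data_validation.nlp.LanguageDetection",
    inputs={"dataset": vm_ds},
)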